In [60]:
from pathlib import Path
import pandas as pd
import sys
ROOT = Path().resolve().parent
src_folder = ROOT / "src"
if str(src_folder) not in sys.path:
    sys.path.insert(0, str(src_folder))
from data_workflow.viz import create_bar, create_line, create_histogram, SaveFig
from data_workflow.utils import bootstrap_diff_means

In [61]:
DATA = ROOT / "data/processed/analytics_table.parquet"
# How many rows and columns 
df = pd.read_parquet(DATA)
df.shape


(5, 18)

In [62]:
audit = pd.DataFrame({"dtype": df.dtypes.astype(str),"missing_count": df.isna().sum(),"missing_percentage": df.isna().mean(),
}).sort_values("missing_percentage", ascending=False)
# missing values in each column
audit.head(5)

Unnamed: 0,dtype,missing_count,missing_percentage
date,object,1,0.2
year,float64,1,0.2
amount_w,Float64,1,0.2
hour,float64,1,0.2
dow,float64,1,0.2


In [64]:
revenue_by_country = (df.groupby("country", dropna=False)["amount"].sum().reset_index())

barchart = create_bar(revenue_by_country, x_col="country",y_col="amount",plot_title="Revenue by country (all time)",)

FIGS = ROOT / "reports/figures"
FIGS.mkdir(parents=True, exist_ok=True)
# highest total revenue
SaveFig(barchart, FIGS / "revenue_by_country.png")
barchart


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [65]:
#monthly trend

monthly_trend = (df.groupby("month", dropna=False)["amount"].sum().reset_index().sort_values("month"))
fig = create_line(monthly_trend,x_col="month",y_col="amount",plot_title="Monthly revenue trend",)
SaveFig(fig, FIGS / "monthly_revenue_trend.png")
fig


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [58]:
df["month"].nunique()


1

In [None]:
#distribution of order amounts winsorized
histogram = create_histogram(df,column="amount_w",bins=30,plot_title="Distribution of order amounts (winsorized)",)
SaveFig(histogram, FIGS / "amount_distribution.png")
histogram

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
refund_data = df.assign(is_refund=(df["status_clean"] == "refund").astype(int))
#statistically meaningful difference in refund rates between SA and AE?
sa_rates = refund_data.loc[refund_data["country"] == "SA", "is_refund"]
ae_rates = refund_data.loc[refund_data["country"] == "AE", "is_refund"]
analysis = bootstrap_diff_means(sa_rates, ae_rates, n_boot=2000, seed=0)
analysis

{'diff_mean': -1.0, 'ci_lower': -1.0, 'ci_upper': -1.0}

In [54]:
df.groupby("country")["amount"].sum()


country
AE      0.0
SA    145.5
Name: amount, dtype: Float64

### Interpretation

- Saudi Arabia generates higher total revenue compared to the UAE.
- Revenue changes across months, which may indicate possible seasonality.
- The distribution of order amounts is right-skewed; winsorization helps reduce the effect of extreme values.
- The bootstrap analysis estimates the difference in refund rates between Saudi Arabia and the UAE.
- The confidence interval helps determine whether the observed difference is statistically meaningful.




In [68]:
import pandas as pd

# اقرأ جدول التحليلات النهائي
df = pd.read_parquet("data/processed/analytics_table.parquet")

print("عدد الصفوف:", len(df))
print("-" * 40)

# -------------------
# 1) المدة الزمنية
# -------------------
min_date = df["created_at"].min()
max_date = df["created_at"].max()

print("الفترة الزمنية:")
print("من:", min_date)
print("إلى:", max_date)
print("-" * 40)

# -------------------
# 2) الإيرادات
# -------------------
total_revenue = df.loc[df["status_clean"] == "paid", "amount"].sum()
avg_order_value = df["amount"].mean()
median_order_value = df["amount"].median()

print("الإيرادات:")
print("إجمالي الإيرادات:", round(total_revenue, 2))
print("متوسط قيمة الطلب (AOV):", round(avg_order_value, 2))
print("الوسيط:", round(median_order_value, 2))
print("-" * 40)

# -------------------
# 3) الاسترجاع (Refund)
# -------------------
refund_rate = (df["status_clean"] == "refund").mean() * 100

print("الاسترجاع:")
print("نسبة الطلبات المسترجعة (%):", round(refund_rate, 2))
print("-" * 40)

# -------------------
# 4) البيانات المفقودة
# -------------------
missing_created_at = df["created_at"].isna().mean() * 100
missing_quantity = df["quantity"].isna().mean() * 100

print("القيم المفقودة:")
print("created_at مفقود (%):", round(missing_created_at, 2))
print("quantity مفقود (%):", round(missing_quantity, 2))
print("-" * 40)

# -------------------
# 5) الربط مع جدول المستخدمين
# -------------------
country_missing = df["country"].isna().mean() * 100
country_match_rate = 100 - country_missing

print("الربط مع users:")
print("نسبة التطابق مع country (%):", round(country_match_rate, 2))
print("-" * 40)

# -------------------
# 6) القيم الشاذة
# -------------------
outliers_count = df["amount_outlier"].sum()

print("القيم الشاذة:")
print("عدد الطلبات الشاذة:", int(outliers_count))


FileNotFoundError: [Errno 2] No such file or directory: 'data/processed/analytics_table.parquet'