# WHO Physical Activity — Global EDA

Dataset: WHO Global Health Observatory — *Prevalence of insufficient physical activity among adults (18+), %*.

This notebook answers:
1) global average
2) differences by region
3) differences by sex
4) trend over time
5) top/bottom countries
6) clustering of countries


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Input file, resolved relative to the notebook's working directory.
DATA_PATH = "insufficient_activity.csv"

# Display settings: show up to 200 columns and allow 120 characters per line
# before pandas wraps its text output.
pd.set_option("display.width", 120)
pd.set_option("display.max_columns", 200)

# Load the raw export once; later cells derive their own frames from `df_raw`.
df_raw = pd.read_csv(DATA_PATH)
df_raw.head()


In [None]:
# Print the raw frame's column names, dtypes and non-null counts.
df_raw.info()

In [None]:
# Missingness check: count nulls per column, then list the 20 columns
# with the most missing values first.
null_counts = df_raw.isna().sum()
null_counts.sort_values(ascending=False).head(20)


In [None]:
# Parameters
YEAR = 2019  # reference year used by every single-year view in this notebook

# Build the per-country table: "Both sexes" rows of the selected year, with the
# measured share (insufficient activity) and its complement (sufficient activity).
df = (
    df_raw.loc[
        (df_raw["Period"] == YEAR) & (df_raw["Dim1"] == "Both sexes"),
        ["Location", "FactValueNumeric"],
    ]
    .rename(columns={"Location": "country", "FactValueNumeric": "insufficient_activity"})
    .dropna(subset=["insufficient_activity"])
    .assign(sufficient_activity=lambda d: 100 - d["insufficient_activity"])
    # Drop the leftover row labels of df_raw so the index is a clean 0..n-1 range.
    .reset_index(drop=True)
)

# Fail here, with the list of usable years, instead of letting an empty frame
# flow into the plots and the clustering step further down.
if df.empty:
    available_periods = sorted(df_raw["Period"].dropna().unique().tolist())
    raise ValueError(
        f"No usable 'Both sexes' rows for Period == {YEAR}; "
        f"periods present in the data: {available_periods}"
    )

# The analysis assumes one row per country; surface any violation, because
# repeated countries would be over-weighted in every average below.
repeated_countries = df.loc[df["country"].duplicated(), "country"].unique().tolist()
if repeated_countries:
    print(f"Note: {len(repeated_countries)} countries have more than one row for {YEAR}: {repeated_countries[:10]}")

df.shape, df.head()


## Q1. Global average (selected year)

In [None]:
# Summary statistics of the insufficient-activity share across the
# countries kept for the selected year; the "mean" row is the global average.
df.loc[:, "insufficient_activity"].describe()


In [None]:
# Histogram of the per-country sufficient-activity share for the selected year.
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df["sufficient_activity"], bins=20)
ax.set_xlabel("Sufficient physical activity (%)")
ax.set_ylabel("Number of countries")
ax.set_title(f"Distribution of sufficient physical activity across countries ({YEAR})")
plt.show()


## Q2. Differences by region

In [None]:
# Region analysis (if ParentLocation is available).
# Averages the "Both sexes" values of the selected year within each
# ParentLocation; every row counts equally (no population weighting).
if "ParentLocation" in df_raw.columns:
    df_region = (
        df_raw[(df_raw["Period"] == YEAR) & (df_raw["Dim1"] == "Both sexes")]
        .groupby("ParentLocation")["FactValueNumeric"]
        .mean()
        .reset_index()
        .rename(columns={"ParentLocation": "region", "FactValueNumeric": "avg_insufficient_activity"})
        .dropna()
        .assign(avg_sufficient_activity=lambda d: 100 - d["avg_insufficient_activity"])
        # sort_values returns a new frame, so the ordering has to be part of the
        # assignment; this is also the order in which the bar chart draws regions.
        .sort_values("avg_sufficient_activity", ascending=False)
        .reset_index(drop=True)
    )
else:
    df_region = None
    print("Column 'ParentLocation' not found; region analysis skipped.")

# A cell only renders its last top-level expression, so show the table here
# rather than inside the `if` branch (a None value renders nothing).
df_region


In [None]:
# Horizontal bar chart of the regional averages; skipped when the region
# table could not be built.
if df_region is not None:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.barplot(data=df_region, x="avg_sufficient_activity", y="region", ax=ax)
    ax.set_xlabel("Average sufficient activity (%)")
    ax.set_ylabel("Region")
    ax.set_title(f"Average sufficient activity by region ({YEAR})")
    plt.show()


## Q3. Differences by sex

In [None]:
# Mean insufficient-activity share per sex for the selected year,
# plus its complement (sufficient activity).
sex_rows = df_raw[df_raw["Dim1"].isin(["Male", "Female"]) & (df_raw["Period"] == YEAR)]
mean_by_sex = sex_rows.groupby("Dim1")["FactValueNumeric"].mean()

df_gender = (
    mean_by_sex.reset_index()
    .rename(columns={"Dim1": "gender", "FactValueNumeric": "avg_insufficient_activity"})
    .dropna()
    .assign(avg_sufficient_activity=lambda g: 100 - g["avg_insufficient_activity"])
)
df_gender


In [None]:
# Bar chart comparing the average sufficient-activity share of the two sexes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.barplot(data=df_gender, x="gender", y="avg_sufficient_activity", ax=ax)
ax.set_ylabel("Average sufficient activity (%)")
ax.set_title(f"Sufficient activity by gender ({YEAR})")
plt.show()


## Q4. Trend over time

In [None]:
# Yearly mean of the "Both sexes" values across all rows of each Period,
# plus the complementary sufficient-activity share.
both_sexes_rows = df_raw.loc[df_raw["Dim1"] == "Both sexes"]
mean_by_period = both_sexes_rows.groupby("Period")["FactValueNumeric"].mean()

df_trend = (
    mean_by_period.reset_index()
    .rename(columns={"Period": "year", "FactValueNumeric": "avg_insufficient_activity"})
    .dropna()
    .assign(avg_sufficient_activity=lambda t: 100 - t["avg_insufficient_activity"])
)
df_trend.head()


In [None]:
# Line chart of the yearly average, one marker per year present in df_trend.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(df_trend["year"], df_trend["avg_sufficient_activity"], marker="o")
ax.set_xlabel("Year")
ax.set_ylabel("Average sufficient activity (%)")
ax.set_title("Global trend in sufficient physical activity (Both sexes)")
plt.show()


## Q5. Top / bottom countries

In [None]:
# The ten countries with the highest and the ten with the lowest
# sufficient-activity share in the selected year.
top10 = df.nlargest(n=10, columns="sufficient_activity")
bottom10 = df.nsmallest(n=10, columns="sufficient_activity")

bottom10, top10


In [None]:
# Box plot of the per-country sufficient-activity share: median, quartiles
# and outlying countries at a glance.
fig, ax = plt.subplots(figsize=(8, 3))
sns.boxplot(x=df["sufficient_activity"], ax=ax)
ax.set_xlabel("Sufficient physical activity (%)")
ax.set_title(f"Boxplot of sufficient physical activity across countries ({YEAR})")
plt.show()


## Q6. Clustering countries (1D feature)

In [None]:
# Segment countries into activity-level groups with k-means on the single
# feature `sufficient_activity` (KMeans / StandardScaler are imported in the
# notebook's first cell together with the other dependencies).
N_CLUSTERS = 3  # number of activity-level groups to form

X = df[["sufficient_activity"]]
X_scaled = StandardScaler().fit_transform(X)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init="auto")
raw_labels = kmeans.fit_predict(X_scaled)

# KMeans numbers its clusters arbitrarily, so the raw ids say nothing about
# activity level. Re-number them by ascending centroid: 0 is the least active
# group and N_CLUSTERS - 1 the most active. `kmeans.labels_` and
# `kmeans.predict` keep the raw ids; map them through `centroid_rank` as well.
centroid_rank = np.argsort(np.argsort(kmeans.cluster_centers_.ravel()))
df["cluster"] = centroid_rank[raw_labels]

# Cluster sizes, listed from the least to the most active group.
df["cluster"].value_counts().sort_index()


In [None]:
# Plot each country's sufficient-activity share against its rank, coloured by
# cluster. `df.index` only carries row labels with no analytical meaning, so
# the countries are sorted first: the x-axis becomes a rank and each cluster
# shows up as one contiguous band.
ranked = df.sort_values("sufficient_activity").reset_index(drop=True)

fig, ax = plt.subplots(figsize=(8, 4))
sns.scatterplot(x=ranked.index, y=ranked["sufficient_activity"], hue=ranked["cluster"], ax=ax)
ax.set_xlabel("Country rank (ascending sufficient activity)")
ax.set_ylabel("Sufficient physical activity (%)")
ax.set_title(f"Clusters of countries by physical activity level ({YEAR})")
plt.show()


## Conclusions

- Physical activity differs substantially across countries.
- Region and gender comparisons (when available) highlight systematic differences.
- The global trend plot summarizes how the cross-country average changes over time.
- Clustering provides a simple segmentation of countries by activity level.

## Limitations

- Single-dataset, descriptive analysis. No causal claims.
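- Country-level values hide within-country inequality, and the global, regional and sex averages here are unweighted means across countries (not population-weighted).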
