In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kruskal

# Read each country's cleaned CSV and tag every row with the country it came from
benin = pd.read_csv("data/benin_clean.csv").assign(Country="Benin")
sierra_leone = pd.read_csv("data/sierraleone_clean.csv").assign(Country="Sierra Leone")
togo = pd.read_csv("data/togo_clean.csv").assign(Country="Togo")

# Stack the three frames into one DataFrame; ignore_index=True renumbers the rows
# so the combined frame does not carry duplicated per-file index values
df = pd.concat((benin, sierra_leone, togo), ignore_index=True)

In [None]:
# Side-by-side boxplots: one panel per irradiance metric, one box per country
sns.set(style="whitegrid")
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for ax, metric in zip(axes, ['GHI', 'DNI', 'DHI']):
    # seaborn >= 0.13 deprecates passing `palette` without `hue`, so colour by the
    # same variable that is on the x-axis. dodge=False keeps a single full-width
    # box per country on older seaborn versions, which would otherwise offset
    # the boxes of each hue level.
    sns.boxplot(x='Country', y=metric, hue='Country', data=df, ax=ax,
                palette="Set2", dodge=False)
    # A hue legend would only repeat the x tick labels; drop it if one was drawn.
    hue_legend = ax.get_legend()
    if hue_legend is not None:
        hue_legend.remove()
    ax.set_title(f'{metric} Distribution by Country')

plt.tight_layout()
plt.show()

In [None]:
# Per-country mean, median and standard deviation of each irradiance metric
irradiance_cols = ["GHI", "DNI", "DHI"]
agg_funcs = ['mean', 'median', 'std']
summary = df.groupby("Country")[irradiance_cols].agg(agg_funcs)

# Flatten the (metric, statistic) column MultiIndex into names such as "GHI_mean"
summary.columns = [f"{metric}_{stat}" for metric, stat in summary.columns]
summary

In [None]:
# Kruskal-Wallis H-test on GHI: do the three countries' samples come from the
# same distribution?
ghi_benin = df.loc[df["Country"] == "Benin", "GHI"]
ghi_sierra = df.loc[df["Country"] == "Sierra Leone", "GHI"]
ghi_togo = df.loc[df["Country"] == "Togo", "GHI"]

# Drop missing readings before testing: kruskal's default nan_policy is
# 'propagate', so a single NaN in any sample would make both results NaN.
stat, p_value = kruskal(ghi_benin.dropna(), ghi_sierra.dropna(), ghi_togo.dropna())

# Use general format for the p-value so that very small values are shown in
# scientific notation instead of being rounded to a misleading "0.0000".
print(f"Kruskal-Wallis H-statistic: {stat:.4f}, p-value: {p_value:.4g}")

In [None]:
# Rank the countries by mean GHI (highest first) and draw the ranking as a
# horizontal bar chart
avg_ghi = df.groupby("Country")["GHI"].mean().sort_values(ascending=False)

plt.figure(figsize=(6, 4))
# seaborn >= 0.13 deprecates passing `palette` without `hue`, so colour by the
# same variable that is on the y-axis. Plotting from a long-form frame keeps the
# bars in the sorted order; dodge=False keeps one full-width bar per country on
# older seaborn versions.
bar_ax = sns.barplot(data=avg_ghi.reset_index(), x="GHI", y="Country",
                     hue="Country", palette="viridis", dodge=False)
# A hue legend would only repeat the y tick labels; drop it if one was drawn.
bar_legend = bar_ax.get_legend()
if bar_legend is not None:
    bar_legend.remove()
plt.title("Average GHI by Country")
plt.xlabel("GHI (W/m²)")
plt.tight_layout()
plt.show()

### Key Observations

- **Togo** shows the highest **average GHI**, indicating strong solar potential overall.
- **Sierra Leone** has the **lowest median GHI** and relatively low variability.
- **Benin** exhibits moderate GHI but shows the **greatest spread**, suggesting high variability across the year.

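The Kruskal-Wallis test on GHI yielded a **p-value < 0.05**, so the null hypothesis that all three countries share the same GHI distribution is rejected at the 5% level: the GHI differences among the countries are **statistically significant**. Note that the test only shows that at least one country differs; it does not identify which ones.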