In [None]:
import pandas as pd


# Location of the raw CSV for each site, keyed by the label that is used in
# every printout and plot title further down the notebook.
dataset_paths = {
    "Benin": "../data/benin-malanville.csv",
    "Sierra Leone": "../data/sierraleone-bumbuna.csv",
    "Togo": "../data/togo-dapaong_qc.csv",
}

# One DataFrame per site; all later cells iterate over this mapping.
datasets = {name: pd.read_csv(path) for name, path in dataset_paths.items()}

for name, df in datasets.items():
    print(f"Dataset: {name}")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() emitted a stray "None".
    df.info()
    print(df.describe())


In [None]:
# Print the describe() table of every loaded dataset under a per-site header.
for name, df in datasets.items():
    summary_stats = df.describe()
    print(f"Summary for {name}:", summary_stats, sep="\n")


In [None]:
# Count the missing entries in each column, one report per dataset.
for name, df in datasets.items():
    null_counts = df.isna().sum()
    print(f"Missing Values in {name}:", null_counts, sep="\n")


In [None]:
# Show the rows whose GHI reading is below zero for each dataset.
for name, df in datasets.items():
    negative_ghi_mask = df["GHI"].lt(0)
    invalid_ghi = df.loc[negative_ghi_mask]
    print(f"Invalid GHI values in {name}:\n", invalid_ghi)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One figure per dataset with side-by-side box plots of the GHI, DNI and DHI
# columns, so extreme values stand out.
irradiance_columns = ["GHI", "DNI", "DHI"]
for name, df in datasets.items():
    ax = sns.boxplot(data=df[irradiance_columns])
    ax.set_title(f"Outliers in {name}")
    plt.show()


In [None]:
# Index every dataset by its parsed "Timestamp" column and plot the GHI, DNI
# and DHI columns over time.
# NOTE: the frames stored in `datasets` are modified in place, so every later
# cell sees "Timestamp" as the index rather than as a column.
for name, df in datasets.items():
    # Guard keeps the cell re-runnable: once "Timestamp" has been moved into
    # the index it is no longer a column, and indexing it would raise KeyError.
    if "Timestamp" in df.columns:
        df["Timestamp"] = pd.to_datetime(df["Timestamp"])
        df.set_index("Timestamp", inplace=True)
    df[["GHI", "DNI", "DHI"]].plot(title=f"Time Series for {name}")
    plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Draw a lower-triangle correlation heatmap for each dataset.
for name, df in datasets.items():
    # Correlate numeric columns only. Recent pandas versions no longer drop
    # non-numeric columns silently in DataFrame.corr(), so a stray text column
    # (e.g. "Timestamp" when the indexing cell has not been run) would raise.
    correlation = df.select_dtypes(include="number").corr()

    # The matrix is symmetric: hide the upper triangle, diagonal included,
    # so each pair of columns is annotated once.
    mask = np.triu(np.ones_like(correlation, dtype=bool))

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        correlation,
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        annot_kws={"size": 10},
        mask=mask,
        linewidths=0.5,
        cbar_kws={"shrink": 0.8},
    )

    plt.title(f"Correlation Heatmap for {name}")
    plt.show()


In [None]:
# Scatter the "WS" column against the "WD" column for each dataset, with the
# points colored by their GHI value.
for name, df in datasets.items():
    ax = sns.scatterplot(x=df["WS"], y=df["WD"], hue=df["GHI"])
    ax.set_title(f"Wind Speed and Direction for {name}")
    plt.show()


In [None]:
# Investigate the impact of RH: plot the "RH" column against GHI, one figure
# per dataset.
for name, df in datasets.items():
    ax = sns.scatterplot(x=df["RH"], y=df["GHI"])
    ax.set_title(f"Relative Humidity vs. GHI for {name}")
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Number of histogram bins, shared by every dataset.
bins = 20

# One GHI histogram per dataset, drawn on an explicitly created Axes.
for name, df in datasets.items():
    fig, ax = plt.subplots(figsize=(8, 6))
    df["GHI"].hist(ax=ax, bins=bins, color="skyblue", edgecolor="black", alpha=0.7)

    ax.set_title(f"Histogram of GHI for {name}", fontsize=14)
    ax.set_xlabel("GHI (Global Horizontal Irradiance)", fontsize=12)
    ax.set_ylabel("Frequency", fontsize=12)
    # Dashed horizontal grid lines only, to make bar heights easier to read.
    ax.grid(axis="y", linestyle="--", alpha=0.7)

    plt.show()


In [None]:
#Z-Score Analysis
from scipy.stats import zscore

# Flag GHI readings lying more than three standard deviations above the mean.
# NOTE: a "z_ghi" column is added in place to every frame in `datasets`.
for name, df in datasets.items():
    # Score the full column and let SciPy skip NaNs, so the result always has
    # one value per row (NaN where GHI is missing). Dropping NaNs before
    # scoring yields fewer values than rows, and whether that can be assigned
    # back depends on whether the installed SciPy returns an index-carrying
    # Series or a bare array.
    df["z_ghi"] = zscore(df["GHI"].to_numpy(), nan_policy="omit")
    anomalies = df[df["z_ghi"] > 3]
    print(f"Anomalies in {name}:\n", anomalies)


In [None]:
def clean_data(df):
    """Return a cleaned copy of ``df``.

    Missing values are forward-filled (each gap takes the previous row's
    value), then rows with a negative "GHI" are removed. Leading gaps that
    have no previous value stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a "GHI" column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and the surviving rows' original
        index labels.

    Raises
    ------
    KeyError
        If ``df`` has no "GHI" column.
    """
    # DataFrame.ffill() replaces fillna(method="ffill", inplace=True): the
    # `method` keyword is deprecated in pandas, and filling in place silently
    # altered the caller's frame.
    filled = df.ffill()
    return filled[filled["GHI"] >= 0]

# Apply clean_data to every loaded dataset, keeping the same site keys.
cleaned_data = {
    site: clean_data(frame)
    for site, frame in datasets.items()
}



In [None]:
# Save each cleaned dataset as ../results/cleaned_<site>.csv, where <site> is
# the dataset label lower-cased with spaces replaced by underscores.
for name, df in cleaned_data.items():
    output_path = f"../results/cleaned_{name.lower().replace(' ', '_')}.csv"
    # The time-series cell moves "Timestamp" into the index, so writing with
    # index=False dropped the timestamps from the saved file. Write the index
    # whenever it is a named one; an unnamed default index is still omitted.
    df.to_csv(output_path, index=df.index.name is not None)
