In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy import stats
from pathlib import Path
from pathlib import Path
import pandas as pd
import seaborn as sns
from typing import Dict, Tuple

We also configure display settings and define the data paths in one place so that data files are managed consistently. Setting up the environment correctly up front keeps the later data-processing and visualization cells running smoothly.



In [None]:
from pathlib import Path
import pandas as pd
import seaborn as sns

# Configuration
class Config:
    """Central definition of the notebook's paths and column groupings.

    All paths are relative to the notebook's working directory. They point
    at ``../data`` so that they agree with the data-loading cell, which reads
    the regional CSV files from ``../data/raw`` and writes the merged file
    under ``../data/processed``.
    """

    # Directory holding the raw per-region CSV files.
    DATA_DIR = Path("../data/raw")
    # Directory for processed outputs; created on demand by setup().
    OUTPUT_DIR = Path("../data/processed")
    # Region labels, spelled as in the raw file names ("<region>_solar_data.csv").
    REGIONS = ['Benin', 'Sierraleon', 'Togo']
    # Column groupings for solar and climate variables
    # (names assumed to match the CSV headers -- TODO confirm against the data).
    SOLAR_COLS = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB']
    CLIMATE_COLS = ['Tamb', 'RH', 'WS', 'Precipitation']
    # Column parsed as datetimes when the raw files are read.
    DATE_COL = 'Timestamp'

    @classmethod
    def setup(cls):
        """Create ``OUTPUT_DIR`` (including missing parents).

        Safe to call repeatedly: an already existing directory is not an error.
        """
        cls.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Presentation defaults: whitegrid theme for seaborn plots and
# two-decimal rendering of floats in pandas output.
sns.set_theme(style='whitegrid')
pd.set_option('display.float_format', '{:.2f}'.format)

# Make sure the output directory exists before any later cell writes to it.
Config.setup()


**Data Loading**

We start by loading the raw datasets for Benin, Sierra Leone, and Togo, so that all the data needed for the analysis is available. Each dataset is tagged with a `Region` column identifying its source, and the three are then concatenated into a single table that is saved for later steps. A quick initial inspection (here, the row count per region) helps spot any obvious issues, such as missing files or unexpected formats.


In [17]:
import pandas as pd
from pathlib import Path

# Input and output locations, relative to the notebook's working directory
data_dir = Path("../data/raw")
output_file = Path("../data/processed/solar_data.csv")

# One raw file per region, named "<region>_solar_data.csv"
regions = ["Benin", "Sierraleon", "Togo"]
dataframes = []

# Read every region whose file is present; report the ones that are missing
for region in regions:
    file_path = data_dir / f"{region}_solar_data.csv"
    if not file_path.exists():
        print(f"❌ File not found for region: {region} -> {file_path}")
        continue

    df = pd.read_csv(file_path, parse_dates=["Timestamp"])
    df["Region"] = region  # tag every row with its source region
    dataframes.append(df)
    print(f"✅ Loaded {region}: {df.shape[0]} rows")

# Stack the regional frames, persist the result, and show the rows per region
if not dataframes:
    print("⚠️ No files were loaded. Check your file paths.")
else:
    merged_df = pd.concat(dataframes, ignore_index=True)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    merged_df.to_csv(output_file, index=False)
    print(f"\n✅ Merged dataset saved to: {output_file}")
    print(f"📊 Summary:\n{merged_df['Region'].value_counts()}")


✅ Loaded Benin: 525600 rows
✅ Loaded Sierraleon: 525600 rows
✅ Loaded Togo: 525600 rows

✅ Merged dataset saved to: ..\data\processed\solar_data.csv
📊 Summary:
Region
Benin         525600
Sierraleon    525600
Togo          525600
Name: count, dtype: int64
