In [4]:
# Imports libraries
import os
import re
import time
import requests
import joblib
import pandas as pd
import numpy as np

# Utilities
from geopy.geocoders import Nominatim

# Ensure directories exist
for d in ["../data/raw", "../data/processed", "../data/final", "../models", "../reports"]:
    os.makedirs(d, exist_ok=True)


In [5]:

def clean_district_name(name):
    """Standardizes district names for consistent merging."""
    if pd.isna(name):
        return ""

    name = str(name).lower()

    # Remove text in brackets
    name = re.sub(r'\s*\([^)]*\)', '', name)

    # Remove common suffixes
    suffixes = [
        ' municipal', ' muni', ' district',
        ' metropolitan', ' metro'
    ]
    for suffix in suffixes:
        name = name.replace(suffix, '')

    # Normalize spacing and symbols
    name = name.replace('-', ' ')
    name = re.sub(r'\s+', ' ', name).strip()

    return name.title()


def get_soil_map():
    """Maps Ghana's administrative regions to their dominant soil types."""
    return {
        # Forest zones
        'Ashanti': 'Forest Ochrosol',
        'Brong Ahafo': 'Forest Ochrosol',
        'Bono': 'Forest Ochrosol',
        'Bono East': 'Forest Ochrosol',
        'Ahafo': 'Forest Ochrosol',
        'Eastern': 'Forest Ochrosol',
        'Western': 'Forest Ochrosol',
        'Western North': 'Forest Ochrosol',
        'Central': 'Forest Ochrosol',

        # Savanna zones
        'Northern': 'Savanna Ochrosol',
        'Savannah': 'Savanna Ochrosol',
        'North East': 'Savanna Ochrosol',
        'Upper East': 'Savanna Ochrosol',
        'Upper West': 'Savanna Ochrosol',
        'Oti': 'Savanna Ochrosol',

        # Coastal & special zones
        'Greater Accra': 'Coastal Savannah',
        'Volta': 'Tropical Black Earth'
    }


In [12]:

# A. LOAD RAW YIELD DATA
# We look for the file in data/raw or the root folder
raw_path = "../data/raw/maize_yield.csv"
if not os.path.exists(raw_path) and os.path.exists("maize_yield.csv"):
    raw_path = "maize_yield.csv" # Fallback for Colab root

if os.path.exists(raw_path):
    df_yield = pd.read_csv(raw_path)
    # Clean Columns
    df_yield.columns = [c.strip().title() for c in df_yield.columns]
    
    # Identify District Column dynamically
    dist_col = next((c for c in df_yield.columns if 'District' in c or 'DISTRICT' in c), None)
    if dist_col:
        df_yield.rename(columns={dist_col: 'District', 'YEAR': 'Year', 'YIELD': 'Yield'}, inplace=True)
        df_yield['District'] = df_yield['District'].apply(clean_district_name)
        print(f"Yield Data Loaded: {len(df_yield)} rows (2010-2016)")
    else:
        raise ValueError("Could not find 'District' column in CSV.")
else:
    raise FileNotFoundError("'maize_yield.csv' not found. Please upload it to data/raw/")

# B. IMPUTE MISSING YEARS (2017-2021)
# We generate new rows using National Averages to capture the PFJ Policy era
print("Imputing 2017-2021 data using National Averages...")
unique_districts = df_yield['District'].unique()

national_avgs = [
    {'Year': 2017, 'Yield': 2.04}, 
    {'Year': 2018, 'Yield': 2.25},
    {'Year': 2019, 'Yield': 2.53}, 
    {'Year': 2020, 'Yield': 2.58},
    {'Year': 2021, 'Yield': 2.53}
]
new_rows = [{'District': d, 'Year': entry['Year'], 'Yield': entry['Yield']} 
            for d in unique_districts for entry in national_avgs]
df_extended = pd.concat([df_yield[['District', 'Year', 'Yield']], pd.DataFrame(new_rows)], ignore_index=True)

# C. GET WEATHER DATA (NASA API)
weather_path = "../data/processed/Weather_Data.csv"

if os.path.exists(weather_path):
    df_weather = pd.read_csv(weather_path)
    print("Weather Data Loaded from cache.")
else:
    print("Weather file not found. Downloading from NASA POWER API...")
    geolocator = Nominatim(user_agent="ghana_maize_project_v1")
    weather_data = []
    base_url = "https://power.larc.nasa.gov/api/temporal/daily/point"
    
    for district in unique_districts:
        try:
            # Geolocate
            loc = geolocator.geocode(f"{district}, Ghana", timeout=10)
            if loc:
                # API Request
                params = {
                    "parameters": "T2M,PRECTOTCORR,RH2M,ALLSKY_SFC_SW_DWN,GWETTOP",
                    "community": "AG", "longitude": loc.longitude, "latitude": loc.latitude,
                    "start": "20100101", "end": "20211231", "format": "JSON"
                }
                resp = requests.get(base_url, params=params)
                
                if resp.status_code == 200:
                    d = resp.json()['properties']['parameter']
                    temp_df = pd.DataFrame({
                        'Date': pd.to_datetime(list(d['T2M'].keys())),
                        'Rainfall': list(d['PRECTOTCORR'].values()),
                        'Temperature': list(d['T2M'].values()),
                        'Humidity': list(d['RH2M'].values()),
                        'Sunlight': list(d['ALLSKY_SFC_SW_DWN'].values()),
                        'Soil_Moisture': list(d['GWETTOP'].values()),
                        'District': district
                    })
                    weather_data.append(temp_df)
            time.sleep(1) # Be polite to API
        except Exception as e:
            print(f"Error for {district}: {e}")
    
    if weather_data:
        df_weather = pd.concat(weather_data)
        df_weather.to_csv(weather_path, index=False)
        print("Weather Download Complete.")
    else:
        raise ValueError("Failed to download weather data.")







Yield Data Loaded: 1681 rows (2010-2016)
Imputing 2017-2021 data using National Averages...
Weather file not found. Downloading from NASA POWER API...
Weather Download Complete.


In [13]:


# SEASONAL WEATHER AGGREGATION (APRIL–AUGUST)

df_weather["Date"] = pd.to_datetime(df_weather["Date"])
df_weather["Year"] = df_weather["Date"].dt.year
df_weather["Month"] = df_weather["Date"].dt.month

season_weather = df_weather[df_weather["Month"].between(4, 8)]

weather_agg = (
    season_weather
    .groupby(["District", "Year"], as_index=False)
    .agg({
        "Rainfall": "sum",
        "Temperature": "mean",
        "Humidity": "mean",
        "Sunlight": "mean",
        "Soil_Moisture": "mean"
    })
)


# ADD SOIL DATA
# Map region → dominant soil type

if "Region" in df_yield.columns:
    df_soil = (
        df_yield[["District", "Region"]]
        .drop_duplicates()
        .assign(
            Region=lambda x: x["Region"].str.title().str.strip(),
            Soil_Type=lambda x: x["Region"]
                .map(get_soil_map())
                .fillna("Forest Ochrosol")
        )
    )
else:
    df_soil = pd.DataFrame({
        "District": unique_districts,
        "Soil_Type": "Forest Ochrosol"
    })



In [14]:

# Merge
df = pd.merge(df_extended, weather_agg, on=['District', 'Year'], how='inner')
df = pd.merge(df, df_soil[['District', 'Soil_Type']], on='District', how='left')

# 1. Winsorization (Cap Outliers)
# df['Yield'] = np.where(df['Yield'] > 4.0, 4.0, df['Yield'])
# 2. Add Event Shocks
df['Pest_Risk'] = np.where(df['Year'].isin([2016, 2017]), 1, 0)
df['PFJ_Policy'] = np.where(df['Year'] >= 2017, 1, 0)

# 3. Add Lag Features (Crucial Step)
df.sort_values(['District', 'Year'], inplace=True)
df['Yield_Lag1'] = df.groupby('District')['Yield'].shift(1)
# Remove rows with NaN (First year has no history)
final_df = df.dropna().reset_index(drop=True)

# Save Master Dataset
final_path = "../data/final/Ghana_Maize.csv"
final_df.to_csv(final_path, index=False)
print(f"Master Dataset Created: {final_df.shape}")


Master Dataset Created: (1775, 12)
