In [4]:
import pandas as pd
import numpy as np
import os

In [None]:

# Load the dataset
df = pd.read_csv('../data/Crop_recommendation_ethiopia_real.csv')

# 1. Derive ALTITUDE from PS (surface pressure) using standard barometric formula
# MERRA-2 PS is in Pa; convert to hPa
df['PS_hPa'] = df['PS'] / 100

# Standard sea-level pressure = 1013.25 hPa
sea_level = 1013.25
df['altitude_m'] = 44330 * (1 - (df['PS_hPa'] / sea_level) ** 0.1903)  # Approximate meters above sea level 

print("Altitude sample (m):", df['altitude_m'].describe())  # Should show realistic Ethiopian range (~500â€“3500m)

# Average temperature (mean of all max/min across seasons)
temp_cols = [col for col in df.columns if 'T2M' in col]
df['temperature'] = df[temp_cols].mean(axis=1)

# Average humidity from QV2M (specific humidity as proxy)
hum_cols = [col for col in df.columns if 'QV2M' in col]
df['humidity'] = df[hum_cols].mean(axis=1) * 1000  # Optional scale if needed (QV2M is kg/kg, often small)

# Annual rainfall approximation (sum seasonal precipitation)
rain_cols = [col for col in df.columns if 'PRECTOTCORR' in col]
df['rainfall'] = df[rain_cols].sum(axis=1)

# Keep direct: N, P, K, ph already single values

# 3. Add other relevant features
df['Zn'] = df['Zn']  # Already there
df['S'] = df['S']
df['soil_moisture'] = df['GWETTOP']  # Topsoil wetness 

# Optional: Encode Soilcolor (one-hot or label encode)
df = pd.get_dummies(df, columns=['Soilcolor'], prefix='soilcolor')
df = df.rename(columns={'Ph': 'ph'})  # Standardize to lowercase 'ph' for consistency

# 4. Select final features for modeling (old 7 + new)
feature_cols = ['N', 'P', 'K', 'ph', 'temperature', 'humidity', 'rainfall', 
                'altitude_m', 'Zn', 'S', 'soil_moisture']
df=df[['N', 'P', 'K', 'ph', 'temperature', 'humidity', 'rainfall', 
                'altitude_m', 'Zn', 'S', 'soil_moisture','label']]


X = df[feature_cols]
y = df['label']

print("\nNew engineered features shape:", X.shape)
print("Sample:\n", X.head())

# Save processed for modeling
os.makedirs('../data/processed', exist_ok=True)
df.to_csv('../data/processed/engineered.csv',index=False)
X.to_csv('../data/processed/X_engineered.csv', index=False)
pd.DataFrame(y).to_csv('../data/processed/y.csv', index=False)

print("New feature shape:", X.shape)
print("Sample features:\n", X.head())

Altitude sample (m): count     3867.000000
mean     32990.465662
std         65.144229
min      32846.778606
25%      32957.289113
50%      32994.519582
75%      33025.017679
max      33109.445967
Name: altitude_m, dtype: float64

New engineered features shape: (3867, 11)
Sample:
       N       P        K    ph  temperature      humidity   rainfall  \
0  0.23   5.401  738.231  5.81    16.512917  10024.166666  24.916667   
1  0.23  10.478  606.382  5.43    16.512917  10024.166666  24.916667   
2  0.23   6.847  386.580  5.41    16.512917  10024.166666  24.916667   
3  0.23   3.418  207.086  5.65    16.512917  10024.166666  24.916667   
4  0.23  39.282  317.357  5.27    16.512917  10024.166666  24.916667   

     altitude_m        Zn       S  soil_moisture  
0  33028.366005  2.976000  13.816           0.73  
1  33028.366005  3.077000  16.421           0.73  
2  33028.366005  6.611000  16.557           0.73  
3  33028.366005  0.460181  16.075           0.73  
4  33028.366005  2.743000  12.