In [8]:
# Cell 1: Imports and load engineered data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
import joblib
import os

# Load engineered data from previous notebook
df = pd.read_csv('../data/processed/engineered.csv')

print("Original shape before merging:", df.shape)
print("Original class distribution:\n", df['label'].value_counts())

Original shape before merging: (3867, 12)
Original class distribution:
 label
Teff          1260
Maize          732
Wheat          715
Barley         503
Bean           253
Pea             94
Sorghum         72
Dagussa         71
Niger seed      64
Potato          48
Red Pepper      29
Fallow          26
Name: count, dtype: int64


In [9]:
# Cell 2: First merging of rare classes
replace_map = {
    'Pea': 'Pulses',
    'Bean': 'Pulses',
    'Niger seed': 'Oilseeds',
    'Potato': 'Other_Specialty',
    'Red Pepper': 'Other_Specialty',
    'Fallow': 'Other_Specialty',
    'Dagussa': 'Minor_Cereals',
    'Sorghum': 'Minor_Cereals'
}
df['label'] = df['label'].replace(replace_map)

print("\nDistribution after first merging:\n", df['label'].value_counts())


Distribution after first merging:
 label
Teff               1260
Maize               732
Wheat               715
Barley              503
Pulses              347
Minor_Cereals       143
Other_Specialty     103
Oilseeds             64
Name: count, dtype: int64


In [10]:
# Cell 3: Additional merging (group Minor_Cereals and Specialty)
additional_map = {
    'Minor_Cereals': 'Cereals',
    'Oilseeds': 'Specialty',
    'Other_Specialty': 'Specialty'
}
df['label'] = df['label'].replace(additional_map)

print("\nDistribution after additional merging:\n", df['label'].value_counts())


Distribution after additional merging:
 label
Teff         1260
Maize         732
Wheat         715
Barley        503
Pulses        347
Specialty     167
Cereals       143
Name: count, dtype: int64


In [11]:
# Cell 4: Manual augmentation for remaining rare classes (safe version)
rare_threshold = 200   # Classes with <200 samples → oversample
target_samples = 400   # Target size per rare class

# Numerical columns (define explicitly - all except label)
numeric_cols = ['N', 'P', 'K', 'ph', 'temperature', 'humidity', 'rainfall',
                'altitude_m', 'Zn', 'S', 'soil_moisture']

print("\nDistribution before augmentation:\n", df['label'].value_counts())

augmented_dfs = [df]  # Start with original data

for label in df['label'].unique():
    class_df = df[df['label'] == label].copy()
    current_count = len(class_df)
    
    if current_count < rare_threshold:
        print(f"Oversampling {label}: {current_count} → {target_samples}")
        
        # Duplicate to target
        multiples = target_samples // current_count
        extra = target_samples % current_count
        
        augmented = pd.concat([class_df] * multiples, ignore_index=True)
        if extra > 0:
            augmented = pd.concat([augmented, class_df.sample(n=extra, random_state=42)], ignore_index=True)
        
        # Safe noise addition
        std_values = class_df[numeric_cols].std(axis=0)
        noise_level = 0.05
        noise = np.random.normal(0, 1, augmented[numeric_cols].shape)
        noisy_values = augmented[numeric_cols].values + noise * std_values.values * noise_level
        
        augmented.loc[:, numeric_cols] = noisy_values
        augmented.loc[:, numeric_cols] = augmented[numeric_cols].clip(lower=0)  # No negatives
        
        augmented_dfs.append(augmented)

# Combine and shuffle
df_aug = pd.concat(augmented_dfs, ignore_index=True)
df_aug = shuffle(df_aug, random_state=42)

print("\nDistribution after augmentation:\n", df_aug['label'].value_counts())
print("New total samples:", len(df_aug))

# Overwrite df for downstream steps
df = df_aug


Distribution before augmentation:
 label
Teff         1260
Maize         732
Wheat         715
Barley        503
Pulses        347
Specialty     167
Cereals       143
Name: count, dtype: int64
Oversampling Cereals: 143 → 400
Oversampling Specialty: 167 → 400

Distribution after augmentation:
 label
Teff         1260
Maize         732
Wheat         715
Specialty     567
Cereals       543
Barley        503
Pulses        347
Name: count, dtype: int64
New total samples: 4667


In [12]:
# Cell 5: Optional - merge major cereals into one class (further balance)
major_map = {
    'Teff': 'Major_Cereals',
    'Maize': 'Major_Cereals',
    'Wheat': 'Major_Cereals',
    'Barley': 'Major_Cereals'
}
df['label'] = df['label'].replace(major_map)

print("\nFinal class distribution (after optional major merging):\n", df['label'].value_counts())


Final class distribution (after optional major merging):
 label
Major_Cereals    3210
Specialty         567
Cereals           543
Pulses            347
Name: count, dtype: int64


In [13]:
# Cell 6: Save the merged/augmented dataframe
os.makedirs('../data/processed', exist_ok=True)
df.to_csv('../data/processed/engineered_merged_aug.csv', index=False)
print("Merged + augmented data saved as 'engineered_merged_aug.csv'")

Merged + augmented data saved as 'engineered_merged_aug.csv'


In [14]:
# Cell 7: Separate X/y, encode label, scale features
X = df.drop('label', axis=1)
y = df['label']

# Encode target
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, '../models/label_encoder_merged.pkl')
print("Encoded classes:", dict(zip(le.classes_, le.transform(le.classes_))))

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, '../models/scaler_merged.pkl')

X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
print("\nScaled features sample:\n", X_scaled_df.head())

Encoded classes: {'Cereals': np.int64(0), 'Major_Cereals': np.int64(1), 'Pulses': np.int64(2), 'Specialty': np.int64(3)}

Scaled features sample:
           N         P         K        ph  temperature  humidity  rainfall  \
0  0.715413 -0.071124 -0.572794 -0.819682    -0.390604  0.186867  0.758555   
1  1.031750  0.189403  0.054317 -0.072808    -0.487255  0.023545  0.804156   
2  0.139727 -0.246996 -0.150420 -1.192321     1.171313  1.025549  0.303969   
3 -0.875674 -0.282722 -0.445275  0.353696    -0.548101 -0.973524 -0.587838   
4  1.406680 -0.270132 -0.321614 -0.001724    -0.133895  0.616101  1.380241   

   altitude_m        Zn         S  soil_moisture  
0    0.088061 -0.366371  1.733051       0.294138  
1    0.532303  0.156307 -1.173513       1.078132  
2   -1.190368  0.641999  0.588476      -0.173013  
3    0.860320 -0.522588 -0.444145       0.527566  
4   -0.025905  0.563644  0.121116       1.628699  


In [15]:
# Cell 8: Stratified splits and save everything
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("\nSplit sizes:")
print(f"Train: {X_train.shape[0]} | Val: {X_val.shape[0]} | Test: {X_test.shape[0]}")

# Save X splits
pd.DataFrame(X_train, columns=X.columns).to_csv('../data/processed/X_train_merged.csv', index=False)
pd.DataFrame(X_val, columns=X.columns).to_csv('../data/processed/X_val_merged.csv', index=False)
pd.DataFrame(X_test, columns=X.columns).to_csv('../data/processed/X_test_merged.csv', index=False)

# Save y encoded
pd.DataFrame(y_train, columns=['label_encoded']).to_csv('../data/processed/y_train_merged.csv', index=False)
pd.DataFrame(y_val, columns=['label_encoded']).to_csv('../data/processed/y_val_merged.csv', index=False)
pd.DataFrame(y_test, columns=['label_encoded']).to_csv('../data/processed/y_test_merged.csv', index=False)

# Save human-readable crop names
pd.DataFrame(le.inverse_transform(y_train), columns=['crop']).to_csv('../data/processed/y_train_names_merged.csv', index=False)
pd.DataFrame(le.inverse_transform(y_val), columns=['crop']).to_csv('../data/processed/y_val_names_merged.csv', index=False)
pd.DataFrame(le.inverse_transform(y_test), columns=['crop']).to_csv('../data/processed/y_test_names_merged.csv', index=False)

print("All merged files saved with '_merged' suffix!")


Split sizes:
Train: 3266 | Val: 700 | Test: 701
All merged files saved with '_merged' suffix!
