In [1]:
import pandas as pd
import numpy as np

# Read the source crop-recommendation table.
df = pd.read_csv('../data/Crop_recommendation.csv')

# Step 1: Keep only the rows whose label is one of the crops retained from the
# original dataset as Ethiopian-relevant.
kept_crops = [
    'maize', 'chickpea', 'lentil', 'coffee', 'cotton',
    'rice', 'banana', 'mango', 'watermelon', 'orange',
]
# .copy() detaches the selection from `df` so later edits do not touch the original.
df_kept = df.loc[df['label'].isin(kept_crops)].copy()

# Report how many rows survived the filter and how they split by crop.
print("Kept rows from original:", df_kept.shape)
print("Kept class distribution:\n", df_kept['label'].value_counts())

# Step 2: Function to generate synthetic data
def generate_crop_data(crop_name, n_samples=150, ranges=None, rng=None):
    """Generate a table of uniformly sampled feature values for one crop.

    Every feature column is drawn independently from a uniform distribution
    between the bounds given for it in ``ranges``.

    Args:
        crop_name: Value written to the ``label`` column of every row.
        n_samples: Number of rows to generate.
        ranges: Mapping with the keys ``'N'``, ``'P'``, ``'K'``, ``'temp'``,
            ``'hum'``, ``'ph'`` and ``'rain'``. Each value is an indexable
            pair whose items 0 and 1 are the lower and upper sampling bounds.
            Required, despite the ``None`` default kept for compatibility.
        rng: Optional random source exposing ``uniform(low, high, size)``,
            such as ``np.random.default_rng(seed)`` or
            ``np.random.RandomState(seed)``. When omitted, NumPy's global
            random state is used, exactly as before this parameter existed.

    Returns:
        A ``pandas.DataFrame`` with ``n_samples`` rows and the columns
        ``N``, ``P``, ``K``, ``temperature``, ``humidity``, ``ph``,
        ``rainfall`` and ``label``, in that order.

    Raises:
        TypeError: If ``ranges`` is not supplied.
        KeyError: If ``ranges`` lacks one or more of the required keys.
    """
    # (output column, key in `ranges`). The order fixes both the column order
    # of the result and the order in which random numbers are consumed.
    column_keys = (
        ('N', 'N'),
        ('P', 'P'),
        ('K', 'K'),
        ('temperature', 'temp'),
        ('humidity', 'hum'),
        ('ph', 'ph'),
        ('rainfall', 'rain'),
    )

    if ranges is None:
        raise TypeError(
            "generate_crop_data() requires 'ranges': a mapping of feature "
            "key to (low, high) bounds"
        )

    # Validate before drawing anything so a bad spec fails fast with the full
    # list of problems and does not advance the random state part-way.
    missing = [key for _, key in column_keys if key not in ranges]
    if missing:
        raise KeyError(f"ranges for {crop_name!r} is missing keys: {missing}")

    sampler = np.random if rng is None else rng

    data = {}
    for column, key in column_keys:
        bounds = ranges[key]
        data[column] = sampler.uniform(bounds[0], bounds[1], n_samples)
    data['label'] = [crop_name] * n_samples
    return pd.DataFrame(data)

# Step 3: Sampling bounds for the crops added on top of the original dataset
# (per the author: based on agronomy sources).
# Sources: Ethiopian MoA guides, FAO crop ecology, soil fertility studies (pH/acidity common issue), highland/lowland adaptations
#
# Layout: crop label -> {feature key -> (lower bound, upper bound)}.
# The feature keys 'N', 'P', 'K', 'temp', 'hum', 'ph' and 'rain' are the ones
# generate_crop_data() looks up when it draws values.
crop_ranges = {
    'teff': {
        'N': (30, 90), 'P': (10, 50), 'K': (20, 100),
        'temp': (15, 30), 'hum': (40, 70), 'ph': (5.8, 7.3), 'rain': (500, 1200),
    },
    'sorghum': {
        'N': (20, 80), 'P': (10, 40), 'K': (20, 80),
        'temp': (20, 35), 'hum': (30, 60), 'ph': (5.5, 7.5), 'rain': (400, 1000),
    },
    'barley': {
        'N': (40, 120), 'P': (20, 60), 'K': (30, 120),
        'temp': (12, 25), 'hum': (50, 80), 'ph': (6.0, 7.8), 'rain': (600, 1400),
    },
    'enset': {
        'N': (50, 130), 'P': (20, 80), 'K': (80, 180),
        'temp': (15, 25), 'hum': (60, 90), 'ph': (5.6, 7.3), 'rain': (1100, 1500),
    },
    'wheat': {
        'N': (50, 130), 'P': (20, 70), 'K': (30, 130),
        'temp': (10, 25), 'hum': (50, 80), 'ph': (6.0, 7.5), 'rain': (600, 1400),
    },
    # NOTE(review): the 9.0 upper pH bound is the highest in this table —
    # confirm against the cited sources.
    'faba_bean': {
        'N': (20, 60), 'P': (30, 80), 'K': (40, 100),
        'temp': (15, 25), 'hum': (60, 90), 'ph': (6.5, 9.0), 'rain': (700, 1200),
    },
    'haricot_bean': {
        'N': (20, 70), 'P': (30, 70), 'K': (30, 90),
        'temp': (18, 30), 'hum': (50, 80), 'ph': (5.5, 7.0), 'rain': (600, 1200),
    },
    'field_pea': {
        'N': (20, 60), 'P': (20, 60), 'K': (30, 80),
        'temp': (10, 25), 'hum': (60, 90), 'ph': (6.0, 7.5), 'rain': (600, 1000),
    },
    'sesame': {
        'N': (40, 100), 'P': (20, 50), 'K': (20, 60),
        'temp': (25, 35), 'hum': (40, 70), 'ph': (5.5, 8.0), 'rain': (500, 800),
    },
    'noug': {
        'N': (30, 90), 'P': (10, 50), 'K': (20, 80),
        'temp': (15, 25), 'hum': (50, 80), 'ph': (5.5, 7.5), 'rain': (800, 1200),
    },
    'linseed': {
        'N': (40, 100), 'P': (20, 60), 'K': (30, 100),
        'temp': (10, 25), 'hum': (50, 80), 'ph': (6.0, 7.5), 'rain': (700, 1300),
    },
    'groundnut': {
        'N': (20, 80), 'P': (30, 70), 'K': (20, 60),
        'temp': (25, 35), 'hum': (50, 80), 'ph': (5.5, 7.0), 'rain': (500, 1000),
    },
    'potato': {
        'N': (80, 150), 'P': (40, 100), 'K': (80, 150),
        'temp': (15, 20), 'hum': (60, 80), 'ph': (5.0, 6.5), 'rain': (750, 1200),
    },
    'sweet_potato': {
        'N': (50, 120), 'P': (30, 80), 'K': (50, 120),
        'temp': (20, 30), 'hum': (60, 90), 'ph': (5.5, 6.5), 'rain': (750, 1500),
    },
    'cassava': {
        'N': (60, 140), 'P': (30, 80), 'K': (60, 150),
        'temp': (25, 35), 'hum': (70, 95), 'ph': (4.5, 8.0), 'rain': (1000, 2000),
    },
    'khat': {
        'N': (50, 120), 'P': (20, 70), 'K': (50, 120),
        'temp': (15, 30), 'hum': (60, 85), 'ph': (5.0, 7.5), 'rain': (800, 1500),
    },
    'sugarcane': {
        'N': (80, 160), 'P': (40, 100), 'K': (80, 180),
        'temp': (25, 35), 'hum': (70, 95), 'ph': (6.0, 8.0), 'rain': (1200, 2500),
    },
}

# Build one synthetic frame of 150 rows for every crop listed in crop_ranges.
# NOTE(review): no random seed is set in this cell, so the generated values
# (and the CSV written below) presumably differ on every run — confirm that
# is intended.
new_dfs = [
    generate_crop_data(crop_label, n_samples=150, ranges=bounds)
    for crop_label, bounds in crop_ranges.items()
]

# Stack the kept original rows first, then the synthetic ones, with a fresh index.
df_ethiopia = pd.concat([df_kept, *new_dfs], ignore_index=True)

# Match the original format: whole-number N/P/K, two decimals elsewhere.
integer_cols = ['N', 'P', 'K']
decimal_cols = ['temperature', 'humidity', 'ph', 'rainfall']
df_ethiopia[integer_cols] = df_ethiopia[integer_cols].round(0).astype(int)
df_ethiopia[decimal_cols] = df_ethiopia[decimal_cols].round(2)

# Write the combined dataset next to the source file.
df_ethiopia.to_csv('../data/Crop_recommendation_ethiopia.csv', index=False)

# Sanity report: overall size and per-crop row counts.
print("Final shape:", df_ethiopia.shape)
print("\nFinal class distribution:\n", df_ethiopia['label'].value_counts())

Kept rows from original: (1000, 8)
Kept class distribution:
 label
rice          100
maize         100
chickpea      100
lentil        100
banana        100
mango         100
watermelon    100
orange        100
cotton        100
coffee        100
Name: count, dtype: int64
Final shape: (3550, 8)

Final class distribution:
 label
teff            150
linseed         150
noug            150
sesame          150
field_pea       150
haricot_bean    150
faba_bean       150
wheat           150
enset           150
barley          150
sorghum         150
sweet_potato    150
cassava         150
groundnut       150
potato          150
khat            150
sugarcane       150
coffee          100
cotton          100
rice            100
maize           100
orange          100
watermelon      100
banana          100
mango           100
lentil          100
chickpea        100
Name: count, dtype: int64
