In [2]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
import joblib
import os


In [5]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib  # For saving preprocessors (useful for FastAPI deployment later)
import os

# Preprocessing pipeline for the crop-recommendation dataset:
#   1. load the raw CSV and encode the crop labels as integers,
#   2. split into train / validation / test (70/15/15, stratified),
#   3. fit the feature scaler on the TRAINING split only and apply it to all splits,
#   4. persist the fitted preprocessors and the processed splits to disk.

# Create output folders. '../models' must exist before the joblib.dump calls
# below, otherwise saving the encoder/scaler fails with FileNotFoundError.
os.makedirs('../data/processed', exist_ok=True)
os.makedirs('../models', exist_ok=True)

df = pd.read_csv('../data/Crop_recommendation.csv')

print("Original shape:", df.shape)
print("\nClass distribution:\n", df['label'].value_counts())

# Separate features and target
X = df.drop('label', axis=1)
y = df['label']

# Encode target labels (crop names -> integers)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Save the label encoder for deployment (to decode predictions back to crop names)
joblib.dump(le, '../models/label_encoder.pkl')

print("Encoded classes example:", dict(zip(le.classes_, le.transform(le.classes_))))

# Train/validation/test split (70/15/15), done on the RAW features so that the
# scaler below never sees validation or test rows while being fitted.
# First split train + (val+test)
X_train_raw, X_temp_raw, y_train, y_temp = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Then split temp into val and test
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp_raw, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Feature scaling: fit on the training split only (avoids leaking val/test
# statistics into the model), then apply the same transform everywhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# Save the scaler for deployment
joblib.dump(scaler, '../models/scaler.pkl')

# Scaled view of the full dataset, for inspection only. Because the scaler was
# fitted on the training split, means/stds here are close to 0/1 but not exact.
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("\nScaled features sample:")
print(X_scaled_df.head())
print("\nScaled stats:")
print(X_scaled_df.describe().round(2))

print("\nSplit sizes:")
print(f"Train: {X_train.shape[0]} samples")
print(f"Validation: {X_val.shape[0]} samples")
print(f"Test: {X_test.shape[0]} samples")

# Crop names for each split (decoded from the integer labels), kept for analysis
y_train_original = le.inverse_transform(y_train)
y_val_original = le.inverse_transform(y_val)
y_test_original = le.inverse_transform(y_test)

# Persist every split: scaled features, encoded labels, and readable crop names
splits = {
    'train': (X_train, y_train, y_train_original),
    'val': (X_val, y_val, y_val_original),
    'test': (X_test, y_test, y_test_original),
}
for split_name, (features, labels, crop_names) in splits.items():
    pd.DataFrame(features, columns=X.columns).to_csv(
        f'../data/processed/X_{split_name}.csv', index=False
    )
    pd.DataFrame(labels, columns=['label_encoded']).to_csv(
        f'../data/processed/y_{split_name}.csv', index=False
    )
    pd.DataFrame(crop_names, columns=['crop']).to_csv(
        f'../data/processed/y_{split_name}_names.csv', index=False
    )


Original shape: (2200, 8)

Class distribution:
 label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64
Encoded classes example: {'apple': np.int64(0), 'banana': np.int64(1), 'blackgram': np.int64(2), 'chickpea': np.int64(3), 'coconut': np.int64(4), 'coffee': np.int64(5), 'cotton': np.int64(6), 'grapes': np.int64(7), 'jute': np.int64(8), 'kidneybeans': np.int64(9), 'lentil': np.int64(10), 'maize': np.int64(11), 'mango': np.int64(12), 'mothbeans': np.int64(13), 'mungbean': np.int64(14), 'muskmelon': np.int64(15), 'orange': np.int64(16), 'papaya': np.int64(17), 'pigeonpeas': np.int64(18), 'pomegranate