In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import SMOTE

# Load dataset
dataset_path = 'updated_soil_fertility.csv'
df = pd.read_csv(dataset_path)

# Selecting required features
features = ['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH']
X = df[features]

# Encoding target variable (Recommendation)
label_enc_recommendation = LabelEncoder()
df['Recommendation'] = label_enc_recommendation.fit_transform(df['Recommendation'])

# Ensure "Leguminous" exists in LabelEncoder
if "Leguminous" not in label_enc_recommendation.classes_:
    label_enc_recommendation.classes_ = np.append(label_enc_recommendation.classes_, "Leguminous")

# Encode 'Type' column
df['Type'] = df['Type'].astype(str)
type_mapping = {type_name: idx for idx, type_name in enumerate(df['Type'].unique())}
inverse_type_mapping = {v: k for k, v in type_mapping.items()}
df['Type'] = df['Type'].map(type_mapping)

# Define target variables
y_recommendation = df['Recommendation']
y_type = df['Type']

# Feature selection using mutual information
selector = SelectKBest(mutual_info_classif, k='all')
X_new = selector.fit_transform(X, y_recommendation)

# Train-Test Split
X_train, X_test, y_train_rec, y_test_rec = train_test_split(
    X_new, y_recommendation, test_size=0.2, random_state=42, stratify=y_recommendation
)

# Handle class imbalance if necessary
if len(np.unique(y_train_rec)) > 1:
    smote = SMOTE(random_state=42)
    X_train, y_train_rec = smote.fit_resample(X_train, y_train_rec)

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter tuning for RandomForest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 4, 6]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train_rec)

# Save best model
best_rf_model = grid_search.best_estimator_
joblib.dump(best_rf_model, 'soil_recommendation_model.pkl')

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Save label encoder for Recommendation
joblib.dump(label_enc_recommendation, 'label_encoder_recommendation.pkl')

# Save Type mappings
joblib.dump(type_mapping, 'type_mapping.pkl')
joblib.dump(inverse_type_mapping, 'inverse_type_mapping.pkl')

print("Model and preprocessing objects saved successfully!")


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Model and preprocessing objects saved successfully!


In [4]:
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE

# Load dataset
dataset_path = 'updated_soil_fertility.csv'
df = pd.read_csv(dataset_path)

# Features and target selection
features = ['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH']
X = df[features]

# Encode Recommendation (Leguminous/General)
label_enc_recommendation = LabelEncoder()
df['Recommendation'] = label_enc_recommendation.fit_transform(df['Recommendation'])

# Encode Type (Specific Plant Name)
df['Type'] = df['Type'].astype(str)
type_mapping = {name: idx for idx, name in enumerate(df['Type'].unique())}
inverse_type_mapping = {v: k for k, v in type_mapping.items()}
df['Type'] = df['Type'].map(type_mapping)

# Target Variables
y_recommendation = df['Recommendation']
y_type = df['Type']

# Feature selection
selector = SelectKBest(f_classif, k='all')
X_new = selector.fit_transform(X, y_recommendation)

# Split data
X_train, X_test, y_train_rec, y_test_rec = train_test_split(X_new, y_recommendation, test_size=0.2, random_state=42, stratify=y_recommendation)

# Handle class imbalance
smote = SMOTE(random_state=42)
X_train, y_train_rec = smote.fit_resample(X_train, y_train_rec)

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train the category model (Leguminous/General)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=4, random_state=42)
rf_model.fit(X_train, y_train_rec)

# Ensure models directory exists
model_dir = "saved_models"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Save category model & preprocessing tools
joblib.dump(rf_model, os.path.join(model_dir, "plant_category_model.pkl"))
joblib.dump(scaler, os.path.join(model_dir, "scaler.pkl"))
joblib.dump(label_enc_recommendation, os.path.join(model_dir, "label_encoder.pkl"))

# Train the type model (Specific plant type)
X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(X_new, y_type, test_size=0.2, random_state=42, stratify=y_type)
type_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
type_model.fit(X_train_type, y_train_type)

# Save type model & mappings
joblib.dump(type_model, os.path.join(model_dir, "plant_type_model.pkl"))
joblib.dump(type_mapping, os.path.join(model_dir, "type_mapping.pkl"))
joblib.dump(inverse_type_mapping, os.path.join(model_dir, "inverse_type_mapping.pkl"))

print("Models saved successfully in 'saved_models' folder!")

# Verify files exist
print("Saved files:")
for file in os.listdir(model_dir):
    print(f" - {file}")


Models saved successfully in 'saved_models' folder!
Saved files:
 - inverse_type_mapping.pkl
 - label_encoder.pkl
 - plant_category_model.pkl
 - plant_type_model.pkl
 - scaler.pkl
 - type_mapping.pkl
