# Soil Health Assessment Model Development

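This notebook develops three predictors of soil health from measured soil parameters:
1. Soil fertility level (High/Medium/Low) — a Random Forest classifier
2. Suitable crops — a rule-based recommender (not a trained model)
3. Soil quality score (0-100) — a Random Forest regressor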

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.multioutput import MultiOutputClassifier

# Seed NumPy's global random number generator for reproducibility.
# The scikit-learn calls further down (train_test_split and both Random
# Forests) additionally pass random_state=42 explicitly.
np.random.seed(42)

## 1. Load and Explore the Dataset

In [None]:
# Load the dataset (the path is relative to the notebook's working directory)
df = pd.read_csv('../data/soil_dataset.csv')

# Report the frame's dimensions, then show the first few rows as the cell output
print(f"Dataset shape: {df.shape}")
df.head()

In [None]:
# Count the missing entries in every column (all zeros means a complete dataset)
df.isna().sum()

In [None]:
# Statistical summary: count, mean, std, min/max and quartiles per column
# (for a mixed-type frame pandas summarises only the numeric columns by default)
df.describe()

In [None]:
# Bar chart of how many samples fall into each fertility class
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='fertility_level', ax=ax)
ax.set_title('Distribution of Soil Fertility Levels')
plt.show()

In [None]:
# Histogram (20 bins) of the quality score with a kernel density overlay
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='quality_score', bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Soil Quality Scores')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
numeric_columns = df.select_dtypes(include=[np.number])
corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Matrix of Soil Parameters')
plt.show()

## 2. Prepare Data for Modeling

In [None]:
# Map the fertility labels to integer codes; the fitted encoder is kept so
# predictions can be turned back into labels later on.
le = LabelEncoder().fit(df['fertility_level'])
df['fertility_level_encoded'] = le.transform(df['fertility_level'])

# classes_[k] is the original label behind integer code k
fertility_classes = le.classes_
print(f"Fertility level classes: {fertility_classes}")

In [None]:
# Input columns shared by both models; the order matters because the helper
# functions below pass values positionally in this same order.
features = ['nitrogen', 'phosphorus', 'potassium', 'ph', 'ec', 'moisture', 'organic_matter']
X = df[features]

# Targets: integer-encoded fertility class and numeric quality score
y_fertility = df['fertility_level_encoded']
y_quality = df['quality_score']

# One 80/20 split applied to the features and both targets at once, so the
# classifier and the regressor are evaluated on the same held-out rows.
(
    X_train, X_test,
    y_fertility_train, y_fertility_test,
    y_quality_train, y_quality_test,
) = train_test_split(X, y_fertility, y_quality, test_size=0.2, random_state=42)

# Standardise using statistics learned from the training rows only, then
# apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 3. Model for Soil Fertility Level Classification

In [None]:
# Train a Random Forest classifier for fertility level
rf_fertility = RandomForestClassifier(n_estimators=100, random_state=42)
rf_fertility.fit(X_train_scaled, y_fertility_train)

# Make predictions on the held-out rows
y_fertility_pred = rf_fertility.predict(X_test_scaled)

# Evaluate the model
fertility_accuracy = accuracy_score(y_fertility_test, y_fertility_pred)
print(f"Fertility Level Classification Accuracy: {fertility_accuracy:.4f}")
print("\nClassification Report:")
# Pass the full list of encoded labels explicitly. The split is not stratified,
# so a class can be missing from the test rows; without `labels` the report
# would then raise because the number of classes found no longer matches
# `target_names`. LabelEncoder assigns codes 0..n_classes-1 in classes_ order.
print(classification_report(
    y_fertility_test,
    y_fertility_pred,
    labels=np.arange(len(fertility_classes)),
    target_names=fertility_classes,
))

In [None]:
# Rank the features by how much the fertility classifier relies on them
importances = rf_fertility.feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

bar_positions = range(X_train.shape[1])
ranked_names = [features[i] for i in indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Feature Importance for Soil Fertility Classification')
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=90)
fig.tight_layout()
plt.show()

## 4. Model for Soil Quality Score Prediction

In [None]:
# Fit a Random Forest regressor on the scaled training rows (fit returns the estimator)
rf_quality = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_quality_train
)

# Predict the quality score for the held-out rows
y_quality_pred = rf_quality.predict(X_test_scaled)

# Error metrics: RMSE is in the same units as the score, R² is unitless
quality_r2 = r2_score(y_quality_test, y_quality_pred)
quality_mse = mean_squared_error(y_quality_test, y_quality_pred)
quality_rmse = np.sqrt(quality_mse)

print(f"Quality Score Prediction MSE: {quality_mse:.4f}")
print(f"Quality Score Prediction RMSE: {quality_rmse:.4f}")
print(f"Quality Score Prediction R²: {quality_r2:.4f}")

In [None]:
# Scatter of predicted against actual quality scores on the test rows
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_quality_test, y_quality_pred, alpha=0.5)

# Dashed red diagonal marks a perfect prediction (predicted == actual)
actual_range = [y_quality_test.min(), y_quality_test.max()]
ax.plot(actual_range, actual_range, 'r--')

ax.set_xlabel('Actual Quality Score')
ax.set_ylabel('Predicted Quality Score')
ax.set_title('Actual vs Predicted Soil Quality Scores')
plt.show()

In [None]:
# Rank the features by how much the quality-score regressor relies on them
importances = rf_quality.feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

bar_positions = range(X_train.shape[1])
ranked_names = [features[i] for i in indices]

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title('Feature Importance for Soil Quality Score Prediction')
ax.bar(bar_positions, importances[indices], align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(ranked_names, rotation=90)
fig.tight_layout()
plt.show()

## 5. Crop Suitability Model

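Crop suitability is determined with a rule-based approach rather than a trained model: a crop is recommended when the soil pH falls within that crop's range and one additional soil parameter meets its threshold.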

In [None]:
def predict_suitable_crops(soil_params):
    """Return the crops whose soil requirements are met by ``soil_params``.

    Args:
        soil_params: Iterable of exactly seven values, in the order
            nitrogen, phosphorus, potassium, ph, ec, moisture, organic_matter.

    Returns:
        list[str]: Names of the matching crops, in the fixed order in which
        the rules are listed below. Empty when no rule matches.

    Raises:
        ValueError: If ``soil_params`` does not unpack into seven values.
    """
    nitrogen, phosphorus, potassium, ph, ec, moisture, organic_matter = soil_params

    # One row per crop: (name, lowest pH, highest pH, extra requirement).
    # pH bounds are inclusive. The extra requirement is wrapped in a lambda so
    # it is only evaluated once the pH check has passed.
    crop_rules = (
        ('Rice', 5.0, 7.5, lambda: moisture > 60),
        ('Wheat', 6.0, 7.5, lambda: nitrogen > 40),
        ('Corn', 5.5, 7.5, lambda: potassium > 50),
        ('Soybeans', 6.0, 7.0, lambda: phosphorus > 40),
        ('Cotton', 5.5, 8.0, lambda: ec < 3),
        ('Tomatoes', 5.5, 7.5, lambda: organic_matter > 3),
        ('Potatoes', 4.8, 6.5, lambda: moisture > 40),
        ('Carrots', 5.5, 7.0, lambda: phosphorus > 30),
        ('Lettuce', 6.0, 7.0, lambda: nitrogen > 30),
        ('Spinach', 6.0, 7.5, lambda: nitrogen > 50),
    )

    return [
        crop
        for crop, ph_min, ph_max, meets_requirement in crop_rules
        if ph_min <= ph <= ph_max and meets_requirement()
    ]

In [None]:
# Smoke-test the rule-based recommender on the first held-out sample
# (unscaled values, since the rules are written against raw measurements)
sample_params = X_test.iloc[0].to_numpy()
print(f"Sample soil parameters: {sample_params}")

suitable_crops = predict_suitable_crops(sample_params)
print(f"Suitable crops: {suitable_crops}")

## 6. Save the Models

In [None]:
import pickle
from pathlib import Path

# Make sure the output directory exists; opening a file for writing inside a
# missing directory would otherwise raise FileNotFoundError.
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Everything needed to reproduce a prediction later: both models plus the
# fitted scaler and label encoder they depend on.
artifacts = {
    'fertility_model.pkl': rf_fertility,
    'quality_model.pkl': rf_quality,
    'scaler.pkl': scaler,
    'label_encoder.pkl': le,
}

for filename, artifact in artifacts.items():
    with open(models_dir / filename, 'wb') as f:
        pickle.dump(artifact, f)

# NOTE: pickle files can execute arbitrary code when loaded; only load these
# artifacts from a location you trust.
print("Models saved successfully!")

## 7. Model Testing with Sample Input

In [None]:
def predict_soil_health(nitrogen, phosphorus, potassium, ph, ec, moisture, organic_matter):
    """Predict fertility level, quality score and suitable crops for one soil sample.

    Relies on the notebook-level objects ``features``, ``scaler``,
    ``rf_fertility``, ``rf_quality`` and ``le`` created in earlier cells.

    Args:
        nitrogen, phosphorus, potassium, ph, ec, moisture, organic_matter:
            Raw (unscaled) soil measurements, given in the same order as the
            ``features`` list used for training.

    Returns:
        dict: ``'fertility_level'`` (decoded class label), ``'quality_score'``
        (float rounded to one decimal) and ``'suitable_crops'`` (list of crop
        names from ``predict_suitable_crops``).
    """
    readings = [nitrogen, phosphorus, potassium, ph, ec, moisture, organic_matter]

    # The scaler was fitted on a DataFrame, so give it a one-row frame with the
    # same column names. A bare ndarray makes scikit-learn warn that X has no
    # valid feature names and skips its column-consistency check.
    input_frame = pd.DataFrame([readings], columns=features)
    input_scaled = scaler.transform(input_frame)

    # Predict the encoded fertility class and map it back to its label
    fertility_pred = rf_fertility.predict(input_scaled)[0]
    fertility_level = le.inverse_transform([fertility_pred])[0]

    # Predict the quality score; convert the NumPy scalar to a plain float so
    # the result prints and serialises cleanly.
    quality_score = float(rf_quality.predict(input_scaled)[0])

    # The crop rules are written against the raw, unscaled measurements
    suitable_crops = predict_suitable_crops(readings)

    return {
        'fertility_level': fertility_level,
        'quality_score': round(quality_score, 1),
        'suitable_crops': suitable_crops
    }

In [None]:
# Hand-picked inputs meant to represent high, medium and low fertility soils
test_cases = [
    # High fertility soil
    {
        'nitrogen': 80, 'phosphorus': 75, 'potassium': 85,
        'ph': 6.5, 'ec': 1.2, 'moisture': 60, 'organic_matter': 7.5,
    },
    # Medium fertility soil
    {
        'nitrogen': 45, 'phosphorus': 40, 'potassium': 50,
        'ph': 7.0, 'ec': 2.0, 'moisture': 45, 'organic_matter': 4.0,
    },
    # Low fertility soil
    {
        'nitrogen': 15, 'phosphorus': 10, 'potassium': 20,
        'ph': 5.0, 'ec': 3.5, 'moisture': 30, 'organic_matter': 1.5,
    },
]

for i, case in enumerate(test_cases):
    print(f"\nTest Case {i+1}:")
    print(f"Input: {case}")

    # The dict keys match the function's parameter names, so unpack directly
    result = predict_soil_health(**case)

    crops_text = ', '.join(result['suitable_crops'])
    print(f"Predicted Fertility Level: {result['fertility_level']}")
    print(f"Predicted Quality Score: {result['quality_score']}")
    print(f"Suitable Crops: {crops_text}")