# Fertilizer Recommendation - Exploratory Data Analysis

This notebook performs exploratory data analysis on the fertilizer recommendation dataset and then benchmarks several baseline classifiers (Section 12). The end goal is to predict the appropriate fertilizer from soil conditions, weather, and crop requirements.

## 1. Import Libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Notebook-wide configuration: silence library warnings for cleaner output
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load Data

In [None]:
# Read the training CSV into the working DataFrame used by every later cell
train_csv_path = '../data/train.csv'
df = pd.read_csv(train_csv_path)

# Report (rows, columns), then preview the first ten records as the cell output
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
df.head(n=10)

## 3. Basic Data Information

In [None]:
# Data info
# df.info() writes its per-column summary straight to stdout and returns None,
# so the heading is printed first and the cell has no displayed result value.
print("Dataset Info:")
df.info()

In [None]:
# Descriptive statistics of the numeric columns, shown as the cell's rich output
print("\nStatistical Summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Count nulls per column and list only the columns that actually contain some
print("\nMissing Values:")
missing = df.isna().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values")

## 4. Target Variable Analysis

In [None]:
# Class frequencies of the target column, computed once and reused for the chart
fertilizer_counts = df['Fertilizer Name'].value_counts()
print("Fertilizer Distribution:")
print(fertilizer_counts)

# Bar chart of the same counts, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
fertilizer_counts.plot(kind='bar', color='steelblue', ax=ax)
ax.set_title('Distribution of Fertilizer Types', fontsize=16, fontweight='bold')
ax.set_xlabel('Fertilizer Name', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 5. Numerical Features Analysis

In [None]:
# Continuous columns examined in this and later sections
numerical_features = ['Temperature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']

# One histogram per feature on a 2x3 grid, flattened for simple iteration
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    ax.hist(df[col], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Outlier check: one box plot per numerical feature on a flattened 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    ax.boxplot(df[col])
    ax.set_title(f'Box Plot of {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)

plt.tight_layout()
plt.show()

## 6. Categorical Features Analysis

In [None]:
# Side-by-side bar charts of the two categorical inputs
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (column, panel title, bar colour) for each panel, left to right
panels = [
    ('Soil Type', 'Distribution of Soil Types', 'coral'),
    ('Crop Type', 'Distribution of Crop Types', 'lightgreen'),
]

for ax, (column, panel_title, bar_color) in zip(axes, panels):
    df[column].value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Pairwise correlations between the numerical features
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap with a diverging palette centred on zero
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 8. Feature Analysis by Fertilizer Type

In [None]:
# Mean N, P and K level per fertilizer, one horizontal bar chart per nutrient
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, nutrient in zip(axes, ['Nitrogen', 'Phosphorous', 'Potassium']):
    # Sorted ascending so bars grow from the bottom of the panel to the top
    mean_by_fertilizer = df.groupby('Fertilizer Name')[nutrient].mean().sort_values()
    mean_by_fertilizer.plot(kind='barh', ax=ax, color='steelblue')
    ax.set_title(f'Average {nutrient} by Fertilizer', fontsize=12, fontweight='bold')
    ax.set_xlabel(f'{nutrient} Level', fontsize=10)
    ax.set_ylabel('Fertilizer Name', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Mean temperature, humidity and moisture per fertilizer, one panel each
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, feature in zip(axes, ['Temperature', 'Humidity', 'Moisture']):
    # Sorted ascending so bars grow from the bottom of the panel to the top
    mean_by_fertilizer = df.groupby('Fertilizer Name')[feature].mean().sort_values()
    mean_by_fertilizer.plot(kind='barh', ax=ax, color='coral')
    ax.set_title(f'Average {feature} by Fertilizer', fontsize=12, fontweight='bold')
    ax.set_xlabel(f'{feature}', fontsize=10)
    ax.set_ylabel('Fertilizer Name', fontsize=10)

plt.tight_layout()
plt.show()

## 9. Pair Plot for Key Features

In [None]:
# Columns shown in the pair plot; the target column is used only for colouring
key_features = ['Nitrogen', 'Phosphorous', 'Potassium', 'Temperature', 'Fertilizer Name']
pairplot_data = df[key_features]
scatter_style = {'alpha': 0.6}

sns.pairplot(
    pairplot_data,
    hue='Fertilizer Name',
    diag_kind='kde',
    palette='Set2',
    plot_kws=scatter_style,
)
# y slightly above 1 keeps the title clear of the top row of panels
plt.suptitle('Pair Plot of Key Features', y=1.02, fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 10. Relationship between Soil Type, Crop Type and Fertilizer

In [None]:
# Soil Type vs Fertilizer: contingency table of soil type (rows) by fertilizer (columns)
soil_fert = pd.crosstab(df['Soil Type'], df['Fertilizer Name'])

# DataFrame.plot opens its own figure unless it is handed an Axes, so a
# preceding plt.figure() would be left behind as an empty extra figure.
# Create the figure once and draw on its Axes explicitly.
fig, ax = plt.subplots(figsize=(14, 6))
soil_fert.plot(kind='bar', stacked=False, ax=ax)
ax.set_title('Fertilizer Usage by Soil Type', fontsize=16, fontweight='bold')
ax.set_xlabel('Soil Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Legend sits outside the plotting area so it does not cover the bars
ax.legend(title='Fertilizer', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Crop Type vs Fertilizer: contingency table of crop type (rows) by fertilizer (columns)
crop_fert = pd.crosstab(df['Crop Type'], df['Fertilizer Name'])

# DataFrame.plot opens its own figure unless it is handed an Axes, so a
# preceding plt.figure() would be left behind as an empty extra figure.
# Create the figure once and draw on its Axes explicitly.
fig, ax = plt.subplots(figsize=(14, 6))
crop_fert.plot(kind='bar', stacked=False, ax=ax)
ax.set_title('Fertilizer Usage by Crop Type', fontsize=16, fontweight='bold')
ax.set_xlabel('Crop Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Legend sits outside the plotting area so it does not cover the bars
ax.legend(title='Fertilizer', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

## 11. Summary Statistics by Fertilizer Type

In [None]:
# Per-fertilizer mean of every numerical feature, printed to two decimals
print("Average values by Fertilizer Type:")
per_fertilizer = df.groupby('Fertilizer Name')
grouped_stats = per_fertilizer[numerical_features].mean()
rounded_stats = grouped_stats.round(2)
print(rounded_stats)

## 12. Classification Algorithm Comparison

In this section, we compare different classification algorithms to determine which performs best for fertilizer recommendation. We also evaluate the ability to predict top-3 fertilizers for each sample.

### 12.1 Data Preparation

In [None]:
# Prepare data for modeling
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import time

# One encoder per categorical column so each integer mapping can be inverted later
le_soil, le_crop, le_fertilizer = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Work on a copy so the frame used by the EDA cells is left untouched
df_model = df.copy()
encoding_plan = [
    ('Soil Type', 'Soil Type Encoded', le_soil),
    ('Crop Type', 'Crop Type Encoded', le_crop),
    ('Fertilizer Name', 'Fertilizer Encoded', le_fertilizer),
]
for source_col, encoded_col, encoder in encoding_plan:
    df_model[encoded_col] = encoder.fit_transform(df_model[source_col])

# Feature matrix (six numeric columns + two encoded categoricals) and encoded target
feature_cols = [
    'Temperature', 'Humidity', 'Moisture',
    'Nitrogen', 'Potassium', 'Phosphorous',
    'Soil Type Encoded', 'Crop Type Encoded',
]
X = df_model[feature_cols]
y = df_model['Fertilizer Encoded']

# Stratified 80/20 hold-out split with a fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_classes = len(le_fertilizer.classes_)
print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Number of classes: {n_classes}")
print(f"\nFertilizer classes: {le_fertilizer.classes_.tolist()}")

### 12.2 Train and Evaluate Multiple Algorithms

In [None]:
# Initialize classifiers.
# Logistic Regression, SVM and KNN are sensitive to feature scale, so each is
# wrapped in a pipeline that standardizes the features first. The scaler is
# fitted inside the pipeline on the training split only, so no test-set
# statistics leak into training. Tree-based models are scale-invariant and
# are used as-is.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1),
    'Logistic Regression': make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
    ),
    'Decision Tree': DecisionTreeClassifier(max_depth=20, random_state=42),
    'SVM': make_pipeline(
        StandardScaler(),
        SVC(kernel='rbf', probability=True, random_state=42),
    ),
    'K-Nearest Neighbors': make_pipeline(
        StandardScaler(),
        KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    ),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
}

# Train and evaluate each classifier.
# results maps algorithm name -> fitted model, test accuracy, fit time (s), test predictions.
results = {}

for name, clf in classifiers.items():
    print(f"\n{'='*60}")
    print(f"Training {name}...")
    print('='*60)

    # Train; perf_counter is monotonic, so the measured duration cannot be
    # distorted by system clock adjustments
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    train_time = time.perf_counter() - start_time

    # Predict on the held-out split
    y_pred = clf.predict(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)

    # Store results
    results[name] = {
        'model': clf,
        'accuracy': accuracy,
        'train_time': train_time,
        'predictions': y_pred
    }

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Training time: {train_time:.2f} seconds")

print("\n" + "="*60)
print("Training completed for all algorithms!")
print("="*60)

### 12.3 Accuracy Comparison

In [None]:
# Compare accuracies: one row per algorithm, best accuracy first
accuracy_comparison = pd.DataFrame({
    'Algorithm': list(results),
    'Accuracy': [entry['accuracy'] for entry in results.values()],
    'Training Time (s)': [entry['train_time'] for entry in results.values()],
}).sort_values('Accuracy', ascending=False)

print("\nAlgorithm Performance Comparison:")
print(accuracy_comparison.to_string(index=False))

# Visualize accuracy comparison
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(accuracy_comparison['Algorithm'], accuracy_comparison['Accuracy'],
        color='skyblue', edgecolor='navy')
ax.set_xlabel('Accuracy', fontsize=12)
ax.set_ylabel('Algorithm', fontsize=12)
ax.set_title('Classification Algorithm Accuracy Comparison', fontsize=14, fontweight='bold')

# A fixed [0.8, 1.0] window hides every bar when accuracies fall below 0.8.
# Keep 0.8 as the default lower bound but widen it (never below 0) whenever
# the weakest model would otherwise be cut off.
lowest_accuracy = accuracy_comparison['Accuracy'].min()
x_lower = 0.8 if pd.isna(lowest_accuracy) else min(0.8, max(0.0, lowest_accuracy - 0.05))
ax.set_xlim([x_lower, 1.0])

# Annotate each bar with its accuracy value
for i, v in enumerate(accuracy_comparison['Accuracy']):
    ax.text(v + 0.005, i, f'{v:.4f}', va='center')
plt.tight_layout()
plt.show()

### 12.4 Detailed Classification Report for Best Model

In [None]:
# The comparison table is sorted by accuracy, so its first row is the winner
best_model_name = accuracy_comparison.iloc[0]['Algorithm']
best_entry = results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

report_rule = '=' * 60
print(f"\n{report_rule}")
print(f"Detailed Report for Best Model: {best_model_name}")
print(report_rule)

# Per-class precision / recall / F1, labelled with the original fertilizer names
print("\nClassification Report:")
best_report = classification_report(
    y_test,
    best_predictions,
    target_names=le_fertilizer.classes_,
)
print(best_report)

### 12.5 Confusion Matrix

In [None]:
# Confusion matrix of the best model on the held-out split (rows = actual, columns = predicted)
cm = confusion_matrix(y_test, best_predictions)
class_names = le_fertilizer.classes_

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
    ax=ax,
)
ax.set_xlabel('Predicted Fertilizer', fontsize=12)
ax.set_ylabel('Actual Fertilizer', fontsize=12)
ax.set_title(f'Confusion Matrix - {best_model_name}', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

### 12.6 Top-3 Prediction Analysis

Because the final system may need to return up to three fertilizer recommendations per sample, we also evaluate top-k accuracy: the fraction of test samples whose true fertilizer appears among a model's k most probable classes (k = 1, 2, 3).

In [None]:
# Calculate top-k accuracy for models that support predict_proba
def calculate_topk_accuracy(model, X_test, y_test, k=3):
    """Return the fraction of samples whose true label is among the k most probable classes.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba``. If it also exposes ``classes_``, the
        probability columns are mapped back to labels through it; otherwise
        column ``j`` is assumed to stand for label ``j``.
    X_test : array-like
        Feature rows, passed unchanged to ``model.predict_proba``.
    y_test : iterable of labels
        True labels, in the same order as the rows of ``X_test``.
    k : int, default 3
        Number of highest-probability classes that count as a hit.

    Returns
    -------
    float or None
        Top-k accuracy in [0, 1], or ``None`` when the model has no
        ``predict_proba``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if not hasattr(model, 'predict_proba'):
        return None
    if k < 1:
        # A slice of [-0:] would silently select *every* column
        raise ValueError(f"k must be >= 1, got {k}")

    # Get probability predictions
    proba = np.asarray(model.predict_proba(X_test))

    # Probability columns follow model.classes_, which need not equal 0..n-1
    # (for example when a class is absent from the training data)
    class_labels = np.asarray(getattr(model, 'classes_', np.arange(proba.shape[1])))

    # Columns of the k largest probabilities per row, translated to labels
    top_k_columns = np.argsort(proba, axis=1)[:, -k:]
    top_k_labels = class_labels[top_k_columns]

    # A sample is a hit when its true label appears anywhere in its top-k row
    y_true = np.asarray(y_test)
    hits = (top_k_labels == y_true[:, None]).any(axis=1)

    return int(hits.sum()) / len(y_true)

# Calculate top-1, top-2, and top-3 accuracy for each model.
# Column names are fixed up front so the table keeps its schema even when no
# model exposes predict_proba; without them an empty result list produces a
# column-less frame and the sort below raises KeyError.
topk_columns = ['Algorithm', 'Top-1 Accuracy', 'Top-2 Accuracy', 'Top-3 Accuracy']
topk_results = []

for name, result in results.items():
    model = result['model']
    top2_acc = calculate_topk_accuracy(model, X_test, y_test, k=2)
    if top2_acc is None:
        # Model cannot rank classes by probability; leave it out of the table
        continue

    topk_results.append({
        'Algorithm': name,
        # Stored accuracy of the model's own predictions, used as top-1
        'Top-1 Accuracy': result['accuracy'],
        'Top-2 Accuracy': top2_acc,
        'Top-3 Accuracy': calculate_topk_accuracy(model, X_test, y_test, k=3),
    })

topk_df = pd.DataFrame(topk_results, columns=topk_columns)
topk_df = topk_df.sort_values('Top-3 Accuracy', ascending=False)

print("\nTop-K Accuracy Comparison:")
print(topk_df.to_string(index=False))

In [None]:
# Visualize top-k accuracy: three grouped bars (top-1/2/3) per algorithm
fig, ax = plt.subplots(figsize=(14, 7))

x = np.arange(len(topk_df))
width = 0.25

ax.bar(x - width, topk_df['Top-1 Accuracy'], width, label='Top-1', color='steelblue')
ax.bar(x, topk_df['Top-2 Accuracy'], width, label='Top-2', color='lightcoral')
ax.bar(x + width, topk_df['Top-3 Accuracy'], width, label='Top-3', color='lightgreen')

ax.set_xlabel('Algorithm', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Top-K Accuracy Comparison Across Algorithms', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(topk_df['Algorithm'], rotation=45, ha='right')
ax.legend()

# A fixed [0.8, 1.0] window hides every bar when accuracies fall below 0.8.
# Keep 0.8 as the default lower bound but widen it (never below 0) whenever
# the weakest score would otherwise be cut off.
accuracy_cols = ['Top-1 Accuracy', 'Top-2 Accuracy', 'Top-3 Accuracy']
lowest_accuracy = topk_df[accuracy_cols].min().min()
y_lower = 0.8 if pd.isna(lowest_accuracy) else min(0.8, max(0.0, lowest_accuracy - 0.05))
ax.set_ylim([y_lower, 1.0])
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

### 12.7 Example: Top-3 Predictions for Sample Data

In [None]:
# Show top-3 predictions for first few test samples
def get_top3_predictions(model, X_sample, le_fertilizer):
    """Get the three most probable fertilizers per sample, with their probabilities.

    Parameters
    ----------
    model : fitted classifier
        Must expose ``predict_proba``. If it also exposes ``classes_``, the
        probability columns are mapped to encoded labels through it; otherwise
        column ``j`` is assumed to stand for encoded label ``j``.
    X_sample : array-like
        Feature rows, passed unchanged to ``model.predict_proba``.
    le_fertilizer : object with a ``classes_`` sequence
        ``le_fertilizer.classes_[label]`` gives the fertilizer name for an
        encoded label.

    Returns
    -------
    list of list of (name, probability) or None
        One inner list per sample, ordered from most to least probable and
        holding at most three entries (fewer if the model has fewer classes).
        ``None`` when the model has no ``predict_proba``.
    """
    if not hasattr(model, 'predict_proba'):
        return None

    proba = np.asarray(model.predict_proba(X_sample))

    # Probability columns follow model.classes_, which need not equal 0..n-1
    # (for example when a class is absent from the training data)
    encoded_labels = np.asarray(getattr(model, 'classes_', np.arange(proba.shape[1])))

    # Last three columns of the ascending argsort, reversed -> descending probability
    top3_columns = np.argsort(proba, axis=1)[:, -3:][:, ::-1]

    return [
        [(le_fertilizer.classes_[encoded_labels[col]], proba[row, col]) for col in columns]
        for row, columns in enumerate(top3_columns)
    ]

# Take the first few held-out rows as illustrative examples
sample_size = 5
X_sample = X_test.iloc[:sample_size]
y_sample = y_test.iloc[:sample_size]

print(f"\nTop-3 Predictions for {sample_size} Sample Test Cases using {best_model_name}:")
print("=" * 80)

top3_preds = get_top3_predictions(best_model, X_sample, le_fertilizer)

for sample_no in range(1, sample_size + 1):
    position = sample_no - 1
    # Decode the integer target back to the fertilizer name
    actual_fertilizer = le_fertilizer.classes_[y_sample.iloc[position]]
    print(f"\nSample {sample_no}:")
    print(f"  Actual Fertilizer: {actual_fertilizer}")
    print(f"  Top-3 Predictions:")
    for rank, (fert_name, prob) in enumerate(top3_preds[position], start=1):
        # Tick the prediction that matches the ground truth
        if fert_name == actual_fertilizer:
            marker = "✓"
        else:
            marker = " "
        print(f"    {rank}. {fert_name:30s} (probability: {prob:.4f}) {marker}")

### 12.8 Model Comparison Summary

In [None]:
# Join accuracy/timing with the top-k table; models without top-k scores keep NaN
comparison_summary = (
    accuracy_comparison
    .merge(topk_df, on='Algorithm', how='left')
    .sort_values('Accuracy', ascending=False)
)

wide_rule = "=" * 100

print("\n" + wide_rule)
print("COMPREHENSIVE MODEL COMPARISON SUMMARY")
print(wide_rule)
print(comparison_summary.to_string(index=False))

print("\n" + wide_rule)
print("KEY FINDINGS:")
print(wide_rule)

# 1. Highest-accuracy model (first row after the sort above)
best_algo = comparison_summary.iloc[0]
print(f"\n1. Best Overall Algorithm: {best_algo['Algorithm']}")
print(f"   - Accuracy: {best_algo['Accuracy']:.4f}")
if pd.notna(best_algo.get('Top-3 Accuracy')):
    print(f"   - Top-3 Accuracy: {best_algo['Top-3 Accuracy']:.4f}")
else:
    print("   - Top-3 Accuracy: N/A")
print(f"   - Training Time: {best_algo['Training Time (s)']:.2f} seconds")

# 2. Model with the shortest fit time
fastest_label = comparison_summary['Training Time (s)'].idxmin()
fastest_algo = comparison_summary.loc[fastest_label]
print(f"\n2. Fastest Algorithm: {fastest_algo['Algorithm']}")
print(f"   - Training Time: {fastest_algo['Training Time (s)']:.2f} seconds")
print(f"   - Accuracy: {fastest_algo['Accuracy']:.4f}")

print("\n3. Top-3 Prediction Capability:")
print("   All probability-based models support providing up to 3 fertilizer recommendations")
print("   with associated confidence scores, which is valuable for giving users options.")

# 4. Recommend deployment only when the best accuracy clears 0.95
print("\n4. Recommendation:")
if best_algo['Accuracy'] > 0.95:
    recommendation = f"   Use {best_algo['Algorithm']} for production deployment due to its excellent accuracy."
else:
    recommendation = "   Consider ensemble methods or feature engineering to improve performance."
print(recommendation)

## 13. Key Insights

Based on the exploratory data analysis:

1. **Dataset Balance**: The dataset is well-balanced across different fertilizer types
2. **NPK Ratios**: Different fertilizers show distinct NPK (Nitrogen-Phosphorous-Potassium) patterns
3. **Environmental Factors**: Temperature and humidity ranges vary by fertilizer type
4. **Soil & Crop Types**: Fertilizer recommendations are relatively uniform across soil and crop types
5. **Feature Importance**: NPK values appear to be the most distinguishing features for fertilizer classification

## Next Steps

1. Feature engineering (if needed)
2. Model development and training (building on the baseline comparison in Section 12)
3. Model evaluation and optimization
4. Deployment with Docker