# Fertilizer Recommendation - Exploratory Data Analysis

This notebook performs exploratory data analysis on the fertilizer recommendation dataset. The goal is to predict the appropriate fertilizer based on soil conditions, weather, and crop requirements.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import warnings
# Keep library warnings out of the rendered cell outputs
warnings.filterwarnings(action='ignore')

# Global plotting defaults: white grid theme and a wide default figure
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 2. Load Data

In [None]:
# Read the training split (path is relative to the notebook's directory)
df = pd.read_csv(filepath_or_buffer='../data/train.csv')

# Report (rows, columns), then preview the first ten records as the cell output
print(f"Dataset shape: {df.shape}")
df.head(n=10)

## 3. Basic Data Information

In [None]:
# Data info
# Schema overview of the loaded frame. DataFrame.info() prints its report
# (columns, dtypes, non-null counts, memory usage) directly to stdout and
# returns None, so this cell has no rich output of its own.
print("Dataset Info:")
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) with
# describe()'s default column selection
print("\nStatistical Summary:")
summary_stats = df.describe()
summary_stats

In [None]:
# Count nulls per column and list only the columns that actually have some
print("\nMissing Values:")
missing = df.isnull().sum()
if missing.sum() > 0:
    print(missing[missing > 0])
else:
    print("No missing values")

## 4. Target Variable Analysis

In [None]:
# Class frequencies of the target column, computed once and reused below
fertilizer_counts = df['Fertilizer Name'].value_counts()
print("Fertilizer Distribution:")
print(fertilizer_counts)

# Bar chart of the same counts on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
fertilizer_counts.plot(kind='bar', ax=ax, color='steelblue')
ax.set_title('Distribution of Fertilizer Types', fontsize=16, fontweight='bold')
ax.set_xlabel('Fertilizer Name', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Slant the class names so long labels do not collide
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

## 5. Numerical Features Analysis

In [None]:
# Continuous columns reused by the later cells of this notebook
numerical_features = [
    'Temperature', 'Humidity', 'Moisture',
    'Nitrogen', 'Potassium', 'Phosphorous',
]

# One histogram per feature, laid out on a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    ax.hist(df[col], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    ax.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Box plots to check for outliers (one panel per numerical feature, 2x3 grid)
fig, axes = plt.subplots(2, 3, figsize=(16, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_features):
    # Axes.boxplot does not skip NaN: a single missing value makes the
    # quartiles NaN and the box is drawn empty. Drop missing entries first and
    # hand matplotlib a plain array so the Series index plays no role.
    values = df[col].dropna().to_numpy()
    ax.boxplot(values)
    ax.set_title(f'Box Plot of {col}', fontsize=12, fontweight='bold')
    ax.set_ylabel(col, fontsize=10)

plt.tight_layout()
plt.show()

## 6. Categorical Features Analysis

In [None]:
# Frequency of each category for the two categorical inputs, side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

# (column, bar colour) for the left and right panel respectively
panels = [('Soil Type', 'coral'), ('Crop Type', 'lightgreen')]

for ax, (column, bar_color) in zip(axes, panels):
    category_counts = df[column].value_counts()
    category_counts.plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(f'Distribution of {column}s', fontsize=14, fontweight='bold')
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Pairwise correlation (DataFrame.corr defaults) between the numerical features
correlation_matrix = df[numerical_features].corr()

# Annotated heatmap with a diverging palette centred on zero
heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix of Numerical Features', fontsize=16, fontweight='bold')
fig.tight_layout()
plt.show()

## 8. Feature Analysis by Fertilizer Type

In [None]:
# Mean nutrient level per fertilizer class, one horizontal bar chart per nutrient
npk_columns = ['Nitrogen', 'Phosphorous', 'Potassium']
by_fertilizer = df.groupby('Fertilizer Name')

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for ax, nutrient in zip(axes, npk_columns):
    # Sort ascending so the largest mean ends up at the top of the chart
    mean_level = by_fertilizer[nutrient].mean().sort_values()
    mean_level.plot(kind='barh', ax=ax, color='steelblue')
    ax.set_title(f'Average {nutrient} by Fertilizer', fontsize=12, fontweight='bold')
    ax.set_xlabel(f'{nutrient} Level', fontsize=10)
    ax.set_ylabel('Fertilizer Name', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
# Mean temperature / humidity / moisture per fertilizer class
environment_columns = ['Temperature', 'Humidity', 'Moisture']
by_fertilizer = df.groupby('Fertilizer Name')

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))

for ax, feature in zip(axes, environment_columns):
    # Sort ascending so the largest mean ends up at the top of the chart
    mean_value = by_fertilizer[feature].mean().sort_values()
    mean_value.plot(kind='barh', ax=ax, color='coral')
    ax.set_title(f'Average {feature} by Fertilizer', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Fertilizer Name', fontsize=10)

fig.tight_layout()
plt.show()

## 9. Pair Plot for Key Features

In [None]:
# Select key features for pair plot (the target column is included only to
# colour the points via `hue`)
key_features = ['Nitrogen', 'Phosphorous', 'Potassium', 'Temperature', 'Fertilizer Name']
pair_grid = sns.pairplot(df[key_features], hue='Fertilizer Name', diag_kind='kde',
                         palette='Set2', plot_kws={'alpha': 0.6})
plt.suptitle('Pair Plot of Key Features', y=1.02, fontsize=16, fontweight='bold')
# seaborn places the legend outside the axes and reserves figure margin for it.
# plt.tight_layout() would discard that margin and push the panels under the
# legend; the grid's own tight_layout() re-runs the layout inside the reserved
# rectangle instead.
pair_grid.tight_layout()
plt.show()

## 10. Relationship between Soil Type, Crop Type and Fertilizer

In [None]:
# Soil Type vs Fertilizer: contingency table of how often each fertilizer
# occurs for each soil type
soil_fert = pd.crosstab(df['Soil Type'], df['Fertilizer Name'])

# DataFrame.plot opens its own figure when no `ax` is given, so a preceding
# plt.figure() call is left behind as an empty figure in the cell output.
# Create the axes explicitly and hand them to pandas instead.
fig, ax = plt.subplots(figsize=(14, 6))
soil_fert.plot(kind='bar', stacked=False, ax=ax)
ax.set_title('Fertilizer Usage by Soil Type', fontsize=16, fontweight='bold')
ax.set_xlabel('Soil Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Anchor the legend outside the plotting area so it does not cover the bars
ax.legend(title='Fertilizer', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Crop Type vs Fertilizer: contingency table of how often each fertilizer
# occurs for each crop type
crop_fert = pd.crosstab(df['Crop Type'], df['Fertilizer Name'])

# DataFrame.plot opens its own figure when no `ax` is given, so a preceding
# plt.figure() call is left behind as an empty figure in the cell output.
# Create the axes explicitly and hand them to pandas instead.
fig, ax = plt.subplots(figsize=(14, 6))
crop_fert.plot(kind='bar', stacked=False, ax=ax)
ax.set_title('Fertilizer Usage by Crop Type', fontsize=16, fontweight='bold')
ax.set_xlabel('Crop Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Anchor the legend outside the plotting area so it does not cover the bars
ax.legend(title='Fertilizer', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

## 11. Summary Statistics by Fertilizer Type

In [None]:
# Per-class mean of every numerical feature, shown to two decimals
print("Average values by Fertilizer Type:")
by_fertilizer = df.groupby('Fertilizer Name')
grouped_stats = by_fertilizer[numerical_features].mean()
print(grouped_stats.round(decimals=2))

## 12. Key Insights

Based on the exploratory data analysis:

1. **Dataset Balance**: The dataset is well-balanced across different fertilizer types
2. **NPK Ratios**: Different fertilizers show distinct NPK (Nitrogen-Phosphorus-Potassium) patterns
3. **Environmental Factors**: Temperature and humidity ranges vary by fertilizer type
4. **Soil & Crop Types**: Fertilizer recommendations are relatively uniform across soil and crop types
5. **Feature Importance**: NPK values appear to be the most distinguishing features for fertilizer classification

## Next Steps

1. Feature engineering (if needed)
2. Model development and training
3. Model evaluation and optimization
4. Deployment with Docker