In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree, ensemble
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the data
rawData = pd.read_csv('soil_health_monitoring_system.csv')
rawData.head()


Unnamed: 0,Sample ID,State,City,Organic Matter (%),Nitrogen (ppm),Phosphorus (ppm),Potassium (ppm),pH Level,Microbial Activity (CFU/g),Soil Structure (1-5),Moisture Retention (%),Iron (ppm),Zinc (ppm),Temperature (°C),Soil Salinity (dS/m),Weather Condition,Soil Compaction (g/cm³),Soil Health
0,1,Rajasthan,Patna,2.6,39,29,241,7.3,886911.1,3,19.4,8.4,1.4,29,0.46,Rainy,1.22,Unhealthy
1,2,West Bengal,Chennai,2.0,35,15,195,6.7,1384623.2,2,14.1,8.2,1.2,25,0.44,Cloudy,1.13,Unhealthy
2,3,Bihar,Pune,4.8,23,17,202,6.9,1230697.0,2,14.7,7.5,1.0,29,0.27,Rainy,1.13,Healthy
3,4,Kerala,Jaipur,3.4,28,22,271,6.4,908546.2,3,14.6,8.2,0.8,29,0.31,Windy,1.1,Unhealthy
4,5,Punjab,Patna,2.1,28,29,256,6.3,903060.4,2,23.8,8.2,1.0,28,0.48,Cloudy,1.25,Unhealthy


In [6]:

rawData = rawData.drop(['State', 'City', 'Weather Condition'], axis=1)
# Prepare the data
data = rawData.copy()
data = data.drop('Soil Health', axis=1)

KeyError: "['State', 'City', 'Weather Condition'] not found in axis"

In [3]:
# Correlation heatmap
corr = data.corr()
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, cmap=sns.color_palette("BuGn_r"), vmin=-1.0, vmax=1.0, square=True, ax=ax)
plt.title("Feature Correlation Heatmap")
plt.show()

NameError: name 'data' is not defined

In [None]:
# Feature distribution
features = data.copy()
features.hist(bins=50, figsize=(10, 10), color='green', grid=False)
plt.title("Feature Distributions")
plt.show()



In [None]:
# Log transformation
transformedFeatures = features.apply(lambda x: np.log10(x) if np.issubdtype(x.dtype, np.number) else x)
transformedFeatures.hist(bins=50, figsize=(10, 10), color='green', grid=False)
plt.title("Log Transformed Feature Distributions")
plt.show()

In [None]:

# Train-test split
labels = rawData[['Soil Health']]
trainInput, validationInput, trainTarget, validationTarget = train_test_split(transformedFeatures, labels, test_size=0.2, shuffle=True, random_state=42)
print("Train Data Shape: ", trainInput.shape)

trainTarget = trainTarget.values.ravel()

In [None]:
# Train classifiers
svcClf = SVC()
svcClf.fit(trainInput, trainTarget)

forestClf = ensemble.RandomForestClassifier()
forestClf.fit(trainInput, trainTarget)

nbClf = GaussianNB()
nbClf.fit(trainInput, trainTarget)

knnClf = KNeighborsClassifier()
knnClf.fit(trainInput, trainTarget)

treeClf = tree.DecisionTreeClassifier()
treeClf.fit(trainInput, trainTarget)

In [None]:
# Evaluate models
models = [svcClf, forestClf, nbClf, knnClf, treeClf]
accs = []
titles = []

for model in models:
    pred = model.predict(validationInput)
    model_acc = accuracy_score(validationTarget, pred)
    accs.append(model_acc)
    titles.append(type(model).__name__)
    print(type(model).__name__, " accuracy is ", model_acc)

In [None]:
# Bar plot for accuracy
fig = plt.figure(figsize=(10, 5))
sns.barplot(x=titles, y=accs)
plt.title("Model Accuracy Comparison")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# Hyperparameter tuning for Random Forest
forestClassifier = ensemble.RandomForestClassifier(random_state=42)
paramGrid = { 
    'n_estimators': [200, 300, 500],
    'max_features': ['sqrt', 'log2'],  # Remove 'auto'
    'max_depth': [4, 5, 6, 7, 8, 9, 10],
    'criterion': ['gini', 'entropy']
}

clf = GridSearchCV(estimator=forestClassifier, param_grid=paramGrid, cv=5)
clf.fit(trainInput, trainTarget)

print("Best Parameters: ", clf.best_params_)

In [None]:
# Train final model with best parameters
randomForestModel = ensemble.RandomForestClassifier(criterion='gini', max_depth=10, max_features='sqrt', n_estimators=300, random_state=42)
randomForestModel.fit(trainInput, trainTarget)





In [None]:
# Predictions and evaluation
predictions = randomForestModel.predict(validationInput)
report = classification_report(validationTarget, predictions)
print(report)

mode_acc = accuracy_score(validationTarget, predictions)
print("Random Forest Model Accuracy: ", mode_acc)


In [None]:
# Confusion Matrix Visualization
cm = confusion_matrix(validationTarget, predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()



In [None]:
# Feature Importance Visualization
importances = randomForestModel.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = features.columns

plt.figure(figsize=(10, 5))
plt.title("Feature Importances")
plt.bar(range(trainInput.shape[1]), importances[indices], align='center')
plt.xticks(range(trainInput.shape[1]), feature_names[indices], rotation=90)
plt.xlim([-1, trainInput.shape[1]])
plt.show()

In [None]:
# ROC Curve (if it's a binary classification problem)
from sklearn.metrics import roc_curve, auc

if len(np.unique(trainTarget)) == 2:  # Check if binary classification
    fpr, tpr, _ = roc_curve(validationTarget, randomForestModel.predict_proba(validationInput)[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()