In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [4]:
# Load dataset
df = pd.read_csv("/Users/dilshanperera/Desktop/Soil Project/ModelNew/classified_soil_with_qualityNew.csv")

# Encode categorical variables (Texture, Soil Quality, Industrial Use)
le_texture = LabelEncoder()
df['Texture'] = le_texture.fit_transform(df['Texture'])

le_soil_quality = LabelEncoder()
df['Soil Quality'] = le_soil_quality.fit_transform(df['Soil Quality'])

le_industrial_use = LabelEncoder()
df['Industrial Use'] = le_industrial_use.fit_transform(df['Industrial Use'])


In [6]:
# Define features and labels
X = df[['Texture', 'Moisture (%)', 'Organic Matter (%)', 'pH', 'Electrical Conductivity (dS/m)']]
y_soil_quality = df['Soil Quality']
y_industrial_use = df['Industrial Use']

# Split the data into training and test sets
X_train, X_test, y_train_soil_quality, y_test_soil_quality = train_test_split(X, y_soil_quality, test_size=0.2, random_state=42)
_, _, y_train_industrial_use, y_test_industrial_use = train_test_split(X, y_industrial_use, test_size=0.2, random_state=42)

In [8]:
# Handle imbalance with SMOTE (Oversampling)
smote = SMOTE(random_state=42)

# For Soil Quality
X_resampled_soil_quality, y_resampled_soil_quality = smote.fit_resample(X_train, y_train_soil_quality)

# For Industrial Use
X_resampled_industrial_use, y_resampled_industrial_use = smote.fit_resample(X_train, y_train_industrial_use)


In [10]:
# Build and train the Random Forest Classifier for Soil Quality
rf_soil_quality = RandomForestClassifier(n_estimators=100, random_state=42)
rf_soil_quality.fit(X_resampled_soil_quality, y_resampled_soil_quality)

# Build and train the Random Forest Classifier for Industrial Use
rf_industrial_use = RandomForestClassifier(n_estimators=100, random_state=42)
rf_industrial_use.fit(X_resampled_industrial_use, y_resampled_industrial_use)


In [12]:
# Predict Soil Quality and calculate accuracy
y_pred_soil_quality = rf_soil_quality.predict(X_test)
accuracy_soil_quality = accuracy_score(y_test_soil_quality, y_pred_soil_quality)
print(f"Accuracy of Soil Quality prediction (RF): {accuracy_soil_quality * 100:.2f}%")
print("Soil Quality Classification Report:")
print(classification_report(y_test_soil_quality, y_pred_soil_quality, target_names=le_soil_quality.classes_))

# Predict Industrial Use and calculate accuracy
y_pred_industrial_use = rf_industrial_use.predict(X_test)
accuracy_industrial_use = accuracy_score(y_test_industrial_use, y_pred_industrial_use)
print(f"Accuracy of Industrial Use prediction (RF): {accuracy_industrial_use * 100:.2f}%")
print("Industrial Use Classification Report:")
print(classification_report(y_test_industrial_use, y_pred_industrial_use, target_names=le_industrial_use.classes_))


Accuracy of Soil Quality prediction (RF): 99.84%
Soil Quality Classification Report:
              precision    recall  f1-score   support

     Average       1.00      0.99      0.99        99
         Bad       1.00      1.00      1.00      2649
        Good       1.00      1.00      1.00      1006

    accuracy                           1.00      3754
   macro avg       1.00      0.99      1.00      3754
weighted avg       1.00      1.00      1.00      3754

Accuracy of Industrial Use prediction (RF): 99.92%
Industrial Use Classification Report:
                     precision    recall  f1-score   support

  Biogas Production       1.00      1.00      1.00       275
Brick Manufacturing       1.00      1.00      1.00       313
       Construction       1.00      1.00      1.00        91
       Glass Making       1.00      0.99      0.99        99
   Land Reclamation       1.00      0.99      1.00       327
       Not Suitable       1.00      1.00      1.00      2649

           accur

In [18]:
from sklearn.linear_model import LogisticRegression
# Create and train the Logistic Regression model for Soil Quality
lr_soil_quality = LogisticRegression(max_iter=200, random_state=42)
lr_soil_quality.fit(X_resampled_soil_quality, y_resampled_soil_quality)

# Create and train the Logistic Regression model for Industrial Use
lr_industrial_use = LogisticRegression(max_iter=200, random_state=42)
lr_industrial_use.fit(X_resampled_industrial_use, y_resampled_industrial_use)
# Predict Soil Quality and calculate accuracy for Logistic Regression
y_pred_soil_quality_lr = lr_soil_quality.predict(X_test)
accuracy_soil_quality_lr = accuracy_score(y_test_soil_quality, y_pred_soil_quality_lr)
print(f"Accuracy of Soil Quality prediction (Logistic Regression): {accuracy_soil_quality_lr * 100:.2f}%")

# Print classification report for Soil Quality
print("Soil Quality Classification Report (Logistic Regression):")
print(classification_report(y_test_soil_quality, y_pred_soil_quality_lr, target_names=le_soil_quality.classes_))

# Predict Industrial Use and calculate accuracy for Logistic Regression
y_pred_industrial_use_lr = lr_industrial_use.predict(X_test)
accuracy_industrial_use_lr = accuracy_score(y_test_industrial_use, y_pred_industrial_use_lr)
print(f"Accuracy of Industrial Use prediction (Logistic Regression): {accuracy_industrial_use_lr * 100:.2f}%")

# Print classification report for Industrial Use
print("Industrial Use Classification Report (Logistic Regression):")
print(classification_report(y_test_industrial_use, y_pred_industrial_use_lr, target_names=le_industrial_use.classes_))


Accuracy of Soil Quality prediction (Logistic Regression): 74.83%
Soil Quality Classification Report (Logistic Regression):
              precision    recall  f1-score   support

     Average       0.39      1.00      0.56        99
         Bad       0.92      0.71      0.80      2649
        Good       0.57      0.83      0.68      1006

    accuracy                           0.75      3754
   macro avg       0.63      0.85      0.68      3754
weighted avg       0.81      0.75      0.76      3754

Accuracy of Industrial Use prediction (Logistic Regression): 65.53%
Industrial Use Classification Report (Logistic Regression):
                     precision    recall  f1-score   support

  Biogas Production       0.55      0.88      0.68       275
Brick Manufacturing       0.53      0.93      0.68       313
       Construction       0.21      0.97      0.34        91
       Glass Making       0.43      0.99      0.60        99
   Land Reclamation       0.42      0.74      0.54       327


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
# Build and train the Random Forest Classifier with class weights for Soil Quality
rf_soil_quality_weighted = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_soil_quality_weighted.fit(X_resampled_soil_quality, y_resampled_soil_quality)

# Build and train the Random Forest Classifier with class weights for Industrial Use
rf_industrial_use_weighted = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_industrial_use_weighted.fit(X_resampled_industrial_use, y_resampled_industrial_use)


In [24]:
# Function to make predictions based on user input
def predict_soil_quality_industrial_use(texture, moisture, organic_matter, ph, conductivity):
    # Prepare the input for prediction
    input_data = pd.DataFrame([[texture, moisture, organic_matter, ph, conductivity]], columns=['Texture', 'Moisture (%)', 'Organic Matter (%)', 'pH', 'Electrical Conductivity (dS/m)'])

    # Predict Soil Quality
    soil_quality_pred = rf_soil_quality.predict(input_data)
    soil_quality = le_soil_quality.inverse_transform(soil_quality_pred)[0]
    
    # Predict Industrial Use
    industrial_use_pred = rf_industrial_use.predict(input_data)
    industrial_use = le_industrial_use.inverse_transform(industrial_use_pred)[0]
    
    return soil_quality, industrial_use

# Function to handle user input and make predictions
def get_user_input_and_predict():
    # Get user input
    texture_input = int(input("Enter the Texture Type (Clayey=0, Sandy=1, Sandy Loam=2, Loamy=3): "))
    moisture_input = float(input("Enter Moisture (%): "))
    organic_matter_input = float(input("Enter Organic Matter (%): "))
    ph_input = float(input("Enter pH: "))
    conductivity_input = float(input("Enter Electrical Conductivity (dS/m): "))

    # Make predictions based on user input
    soil_quality, industrial_use = predict_soil_quality_industrial_use(
        texture_input, moisture_input, organic_matter_input, ph_input, conductivity_input
    )

    # Display predictions
    print(f"Predicted Soil Quality: {soil_quality}")
    print(f"Predicted Industrial Use: {industrial_use}")

# Example of how to use the function
get_user_input_and_predict()


Enter the Texture Type (Clayey=0, Sandy=1, Sandy Loam=2, Loamy=3):  2
Enter Moisture (%):  11.812793394370178
Enter Organic Matter (%):  4.383395818956094
Enter pH:  6.055422366496843
Enter Electrical Conductivity (dS/m):  1.3037990908942991


Predicted Soil Quality: Good
Predicted Industrial Use: Construction


In [26]:
import pickle 

# Save the models
with open("rf_soil_quality_model.pkl", "wb") as f:
    pickle.dump(rf_soil_quality, f)

with open("rf_industrial_use_model.pkl", "wb") as f:
    pickle.dump(rf_industrial_use, f)

In [6]:
import pickle
import pandas as pd

# Load the trained models
with open("rf_soil_quality_model.pkl", "rb") as f:
    rf_soil_quality = pickle.load(f)

with open("rf_industrial_use_model.pkl", "rb") as f:
    rf_industrial_use = pickle.load(f)

# Define the mapping for label-encoded categories
soil_quality_mapping = {0: "Poor", 1: "Moderate", 2: "Good"}  # Adjust based on original encoding
industrial_use_mapping = {0: "Agriculture", 1: "Construction", 2: "Landscaping"}  # Adjust accordingly

def predict_soil_quality_industrial_use(texture, moisture, organic_matter, ph, conductivity):
    # Prepare the input for prediction
    input_data = pd.DataFrame([[texture, moisture, organic_matter, ph, conductivity]], 
                               columns=['Texture', 'Moisture (%)', 'Organic Matter (%)', 'pH', 'Electrical Conductivity (dS/m)'])

    # Predict Soil Quality
    soil_quality_pred = rf_soil_quality.predict(input_data)[0]
    soil_quality = soil_quality_mapping.get(soil_quality_pred, "Unknown")

    # Predict Industrial Use
    industrial_use_pred = rf_industrial_use.predict(input_data)[0]
    industrial_use = industrial_use_mapping.get(industrial_use_pred, "Unknown")

    return soil_quality, industrial_use

# Get user input
texture_input = int(input("Enter the Texture Type (Clayey=0, Sandy=1, Sandy Loam=2, Loamy=3): "))
moisture_input = float(input("Enter Moisture (%): "))
organic_matter_input = float(input("Enter Organic Matter (%): "))
ph_input = float(input("Enter pH: "))
conductivity_input = float(input("Enter Electrical Conductivity (dS/m): "))

# Make predictions
soil_quality, industrial_use = predict_soil_quality_industrial_use(
    texture_input, moisture_input, organic_matter_input, ph_input, conductivity_input
)

# Display predictions
print(f"Predicted Soil Quality: {soil_quality}")
print(f"Predicted Industrial Use: {industrial_use}")


Enter the Texture Type (Clayey=0, Sandy=1, Sandy Loam=2, Loamy=3):  3
Enter Moisture (%):  56
Enter Organic Matter (%):  34
Enter pH:  6.8767
Enter Electrical Conductivity (dS/m):  4.97877


Predicted Soil Quality: Moderate
Predicted Industrial Use: Unknown


In [12]:
#import streamlit as st
import pickle
import pandas as pd

# Load the trained models
with open("rf_soil_quality_model.pkl", "rb") as f:
    rf_soil_quality = pickle.load(f)

with open("rf_industrial_use_model.pkl", "rb") as f:
    rf_industrial_use = pickle.load(f)

# Define the mapping for label-encoded categories
soil_quality_mapping = {0: "Poor", 1: "Moderate", 2: "Good"}  # Adjust based on original encoding
industrial_use_mapping = {0: "Agriculture", 1: "Construction", 2: "Landscaping"}  # Adjust accordingly

# Streamlit UI
st.title("🌱 Soil Quality & Industrial Use Prediction")
st.write("Enter soil parameters below to predict its quality and industrial usability.")

# Dropdown for texture type
texture_options = {"Clayey": 0, "Sandy": 1, "Sandy Loam": 2, "Loamy": 3}
texture = st.selectbox("Select Texture Type:", list(texture_options.keys()))
texture_input = texture_options[texture]

# Input fields
moisture_input = st.slider("Moisture (%)", 0.0, 100.0, 20.0)
organic_matter_input = st.slider("Organic Matter (%)", 0.0, 10.0, 2.5)
ph_input = st.number_input("pH", min_value=0.0, max_value=14.0, value=7.0, step=0.1)
conductivity_input = st.number_input("Electrical Conductivity (dS/m)", min_value=0.0, max_value=10.0, value=1.0, step=0.1)

# Prediction function
def predict_soil_quality_industrial_use(texture, moisture, organic_matter, ph, conductivity):
    input_data = pd.DataFrame([[texture, moisture, organic_matter, ph, conductivity]],
                               columns=['Texture', 'Moisture (%)', 'Organic Matter (%)', 'pH', 'Electrical Conductivity (dS/m)'])
    soil_quality_pred = rf_soil_quality.predict(input_data)[0]
    industrial_use_pred = rf_industrial_use.predict(input_data)[0]
    return soil_quality_mapping.get(soil_quality_pred, "Unknown"), industrial_use_mapping.get(industrial_use_pred, "Unknown")

# Predict button
if st.button("Predict"):  
    soil_quality, industrial_use = predict_soil_quality_industrial_use(texture_input, moisture_input, organic_matter_input, ph_input, conductivity_input)
    
    # Display results
    st.success(f"**Predicted Soil Quality:** {soil_quality}")
    st.info(f"**Predicted Industrial Use:** {industrial_use}")