In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from itertools import combinations

# Define individual models
models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=2, random_state=42),
    'LGBM': LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=10, random_state=42),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=200, max_depth=20, random_state=42),
    'SVC': SVC(probability=True, kernel='rbf', C=1, gamma='scale', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}


In [None]:
import pandas as pd
import numpy as np
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from itertools import combinations
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE

# ---------------- Load and Preprocess Data ----------------
# Upload the file manually
uploaded = files.upload()

# Load dataset from the correct filename
df = pd.read_csv("CleanedSoilDataset_With_RefinedSoilQuality.csv")

# Encode categorical target column (Replace 'Soil_Quality' with actual column name)
label_encoder = LabelEncoder()
df['Soil_Quality'] = label_encoder.fit_transform(df['Soil_Quality'])

# Define features (X) and target (y)
X = df.drop(columns=['Soil_Quality'])
y = df['Soil_Quality']

# Feature Selection
feature_selector = RFE(ExtraTreesClassifier(n_estimators=100, random_state=42), n_features_to_select=10)
X_selected = feature_selector.fit_transform(X, y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


Saving CleanedSoilDataset_With_RefinedSoilQuality.csv to CleanedSoilDataset_With_RefinedSoilQuality (3).csv


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE
from google.colab import files

# ---------------- Load and Preprocess Data ----------------
# Upload dataset manually in Google Colab
uploaded = files.upload()
df = pd.read_csv("CleanedSoilDataset_With_RefinedSoilQuality.csv")

# Encode categorical target column (Replace 'Soil_Quality' with actual column name)
label_encoder = LabelEncoder()
df['Soil_Quality'] = label_encoder.fit_transform(df['Soil_Quality'])

# Define features (X) and target (y)
X = df.drop(columns=['Soil_Quality'])
y = df['Soil_Quality']

# Feature Selection
feature_selector = RFE(ExtraTreesClassifier(n_estimators=100, random_state=42), n_features_to_select=10)
X_selected = feature_selector.fit_transform(X, y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Handle Class Imbalance with SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

# Split into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# ---------------- Define Machine Learning Models ----------------
models = {
    'RandomForest': RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=2, random_state=42),
    'LGBM': LGBMClassifier(n_estimators=200, learning_rate=0.05, max_depth=10, random_state=42),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=200, max_depth=20, random_state=42),
    'SVC': SVC(probability=True, kernel='rbf', C=1, gamma='scale', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

# ---------------- Function to Evaluate Model Combinations ----------------
def evaluate_model_combinations(model_dict, X_train, X_test, y_train, y_test):
    results = []

    # Generate all possible model combinations (pairs, triplets, etc.)
    for r in range(2, len(model_dict) + 1):  # Combinations of 2 models up to all models
        for combination in combinations(model_dict.items(), r):
            model_names = [name for name, _ in combination]
            estimators = [(name, model) for name, model in combination]

            # Create and Train Voting Classifier
            ensemble_model = VotingClassifier(estimators=estimators, voting='soft')
            ensemble_model.fit(X_train, y_train)

            # Make Predictions
            y_pred = ensemble_model.predict(X_test)

            # Compute Accuracy
            accuracy = accuracy_score(y_test, y_pred)
            results.append((model_names, accuracy))

            # Print result for each combination
            print(f"Combination: {model_names}, Accuracy: {accuracy:.4f}")

    return results

# ---------------- Evaluate Different Model Combinations ----------------
results = evaluate_model_combinations(models, X_train, X_test, y_train, y_test)

# Sort and Display Top Model Combinations
sorted_results = sorted(results, key=lambda x: x[1], reverse=True)

print("\nTop Model Combinations by Accuracy:")
for combo, acc in sorted_results[:5]:  # Show top 5 combinations
    print(f"Models: {combo} | Accuracy: {acc:.4f}")


Saving CleanedSoilDataset_With_RefinedSoilQuality.csv to CleanedSoilDataset_With_RefinedSoilQuality (4).csv




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM'], Accuracy: 0.9726
Combination: ['RandomForest', 'ExtraTrees'], Accuracy: 0.9677
Combination: ['RandomForest', 'SVC'], Accuracy: 0.9786
Combination: ['RandomForest', 'KNN'], Accuracy: 0.9628
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'ExtraTrees'], Accuracy: 0.9754
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'SVC'], Accuracy: 0.9819
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'KNN'], Accuracy: 0.9707
Combination: ['ExtraTrees', 'SVC'], Accuracy: 0.9793
Combination: ['ExtraTrees', 'KNN'], Accuracy: 0.9608
Combination: ['SVC', 'KNN'], Accuracy: 0.9717




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'ExtraTrees'], Accuracy: 0.9723




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'SVC'], Accuracy: 0.9805




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002498 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'KNN'], Accuracy: 0.9712
Combination: ['RandomForest', 'ExtraTrees', 'SVC'], Accuracy: 0.9789
Combination: ['RandomForest', 'ExtraTrees', 'KNN'], Accuracy: 0.9651
Combination: ['RandomForest', 'SVC', 'KNN'], Accuracy: 0.9744
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002380 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'ExtraTrees', 'SVC'], Accuracy: 0.9800
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'ExtraTrees', 'KNN'], Accuracy: 0.9721
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002422 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'SVC', 'KNN'], Accuracy: 0.9782
Combination: ['ExtraTrees', 'SVC', 'KNN'], Accuracy: 0.9742




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'ExtraTrees', 'SVC'], Accuracy: 0.9788




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002481 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'ExtraTrees', 'KNN'], Accuracy: 0.9717




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001304 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'SVC', 'KNN'], Accuracy: 0.9768
Combination: ['RandomForest', 'ExtraTrees', 'SVC', 'KNN'], Accuracy: 0.9747
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002430 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['LGBM', 'ExtraTrees', 'SVC', 'KNN'], Accuracy: 0.9772




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002379 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2408
[LightGBM] [Info] Number of data points in the train set: 22780, number of used features: 10
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438
[LightGBM] [Info] Start training from score -1.609438




Combination: ['RandomForest', 'LGBM', 'ExtraTrees', 'SVC', 'KNN'], Accuracy: 0.9761

Top Model Combinations by Accuracy:
Models: ['LGBM', 'SVC'] | Accuracy: 0.9819
Models: ['RandomForest', 'LGBM', 'SVC'] | Accuracy: 0.9805
Models: ['LGBM', 'ExtraTrees', 'SVC'] | Accuracy: 0.9800
Models: ['ExtraTrees', 'SVC'] | Accuracy: 0.9793
Models: ['RandomForest', 'ExtraTrees', 'SVC'] | Accuracy: 0.9789
