In [78]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

# PREPARE DATA


In [84]:
# Read the district-level table from its CSV file.
df = pd.read_csv('dataset/district_information.csv')

# The five crop-rank columns are the targets; every other column is a
# candidate feature.
Y_column = ['crop_rank_1', 'crop_rank_2', 'crop_rank_3', 'crop_rank_4', 'crop_rank_5']
y = df[Y_column]
X = df.drop(columns=Y_column)

# Report the starting dimensions before any feature pruning.
print(f"Original features: {len(X.columns)}")
print(f"Samples: {df.shape[0]}")

Original features: 27
Samples: 903


# REMOVE REDUNDANT FEATURES

In [89]:
# Columns flagged by hand as redundant; they are dropped before any
# statistical feature screening.
redundant = {
    'Source',
    'temperature_2m_min', 'temperature_2m_max',
    'elev_min', 'elev_max',
    'slope_min', 'slope_max',
    'state', 'district',
    'lat', 'lon'
}

# Drop by column label; X itself is left untouched so this cell can be re-run.
X_reduced = X.drop(columns=list(redundant))
print(f"Features after manual removal: {len(X_reduced.columns)}")


Features after manual removal: 16


In [90]:
# CORRELATION-BASED REMOVAL
def remove_correlated_features(df, threshold=0.85):
    """Identify numeric columns that are highly correlated with an earlier column.

    Despite the name, nothing is dropped here: the function only reports
    which columns are candidates for removal and leaves ``df`` unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Non-numeric columns are ignored.
    threshold : float, default 0.85
        A column is flagged when its absolute correlation with any numeric
        column positioned before it is strictly greater than this value.

    Returns
    -------
    to_remove : list
        Labels of the flagged columns, in their original column order.
    corr_matrix : pandas.DataFrame
        Absolute pairwise correlation matrix of the numeric columns.
    """
    numeric_frame = df.select_dtypes(include=[np.number])
    corr_matrix = numeric_frame.corr().abs()

    # Keep only the cells strictly above the diagonal so each pair is looked
    # at once and the later column of a correlated pair is the one flagged.
    above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper_triangle = corr_matrix.where(above_diagonal)

    to_remove = [
        label
        for label, correlations in upper_triangle.items()
        if (correlations > threshold).any()
    ]

    return to_remove, corr_matrix

# Single source of truth for the cutoff, so the value passed to the
# screening function and the value shown in the report cannot drift apart.
CORRELATION_THRESHOLD = 0.85

# Screen the manually reduced feature set for highly correlated columns.
correlated_features, corr_matrix = remove_correlated_features(
    X_reduced, threshold=CORRELATION_THRESHOLD
)

# List each flagged column on its own bullet line.
print(f"🔍 Highly correlated features to remove (correlation > {CORRELATION_THRESHOLD}):")
for feature in correlated_features:
    print(f"   - {feature}")

🔍 Highly correlated features to remove (correlation > 0.85):
   - shortwave_radiation_sum
   - slope_mean
   - OM (%)


In [91]:
# Remove the highly correlated features.
# This cell overwrites its own input, so a second run would otherwise raise
# KeyError because the columns are already gone. errors="ignore" makes the
# re-run a no-op instead. The labels come from X_reduced itself, so ignoring
# missing ones cannot hide a typo.
X_reduced = X_reduced.drop(columns=correlated_features, errors="ignore")
print(f"Features after correlation removal: {X_reduced.shape[1]}")

Features after correlation removal: 13


# ENCODE CATEGORICAL VARIABLES

In [92]:
# Work on a copy so X_reduced keeps its un-encoded values.
X_processed = X_reduced.copy()

# Object-dtype columns are treated as categorical.
categorical_cols = list(X_processed.select_dtypes(include=['object']).columns)
print(f"Categorical columns: {categorical_cols}")

# One fitted encoder per column, kept so the integer codes can be mapped
# back to the original labels later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    # NOTE(review): astype(str) turns missing values into the string 'nan',
    # so they are encoded as their own category rather than imputed below.
    # Confirm this is intended.
    X_processed[col] = le.fit_transform(X_processed[col].astype(str))

# Fill the remaining gaps with each column's median.
column_medians = X_processed.median()
X_processed = X_processed.fillna(column_medians)

print(f"Final preprocessed features: {X_processed.shape[1]}")

Categorical columns: ['crop_type', 'Land Use']
Final preprocessed features: 13
