
# Data Preparation

In [78]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns

In [79]:
# Load the raw crop production records
df = pd.read_csv('dataset/Crops_District_Production.csv')

# Basic information: size and a preview of the first rows
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())

print("\nColumn info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()

# Check for any missing values (count of nulls per column)
print("\nMissing values:")
print(df.isnull().sum())

Dataset shape: (11002, 6)

First few rows:
         date  state    district   crop_type  crop_species  production
0  2017-01-01  Johor  Batu Pahat  cash_crops       cassava       920.5
1  2017-01-01  Johor  Batu Pahat  cash_crops    groundnuts         0.0
2  2017-01-01  Johor  Batu Pahat  cash_crops    sweet_corn         0.0
3  2017-01-01  Johor  Batu Pahat  cash_crops  sweet_potato       350.0
4  2017-01-01  Johor  Batu Pahat  cash_crops           yam       395.4

Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11002 entries, 0 to 11001
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          11002 non-null  object 
 1   state         11002 non-null  object 
 2   district      11002 non-null  object 
 3   crop_type     11002 non-null  object 
 4   crop_species  11002 non-null  object 
 5   production    11002 non-null  float64
dtypes: float64(1), object(5)
memory usage: 515.8+ KB
None

Missi

In [80]:
# Group by state, district, crop_type and find the top species by production
def transform_to_top5_format(df, top_n=5):
    """
    Build one row per (state, district, crop_type) listing its top species.

    Production is first summed per species inside each group, so a species
    that appears on several rows (for example once per date) can occupy only
    one rank slot. Species are then ordered by that total, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'state', 'district', 'crop_type',
        'crop_species' and 'production'.
    top_n : int, default 5
        Number of ranked species columns to produce.

    Returns
    -------
    pandas.DataFrame
        Columns 'state', 'district', 'crop_type', then 'crop_rank_1' through
        'crop_rank_<top_n>'. The rank columns are always all present; groups
        with fewer than ``top_n`` species hold missing values in the unused
        ones.

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError("top_n must be at least 1")

    group_keys = ['state', 'district', 'crop_type']

    # Collapse repeated records of the same species into a single total.
    # min_count=1 keeps the total missing when every record is missing
    # instead of turning it into 0.
    totals = (
        df.groupby(group_keys + ['crop_species'], sort=False)['production']
        .sum(min_count=1)
        .reset_index()
    )

    # Highest production first inside each group
    ordered = totals.sort_values(
        group_keys + ['production'],
        ascending=[True] * len(group_keys) + [False],
    )

    # Keep the leading top_n species per group and number them 1..top_n
    top_rows = ordered.groupby(group_keys).head(top_n).copy()
    top_rows['rank'] = top_rows.groupby(group_keys).cumcount() + 1

    # One column per rank, one row per group
    result = top_rows.pivot_table(
        index=group_keys,
        columns='rank',
        values='crop_species',
        aggfunc='first',
    )

    # Name the columns by their actual rank (not by position) and make sure
    # every rank column exists even if no group has that many species.
    rank_numbers = list(range(1, top_n + 1))
    result = result.reindex(columns=rank_numbers)
    result.columns = [f'crop_rank_{i}' for i in rank_numbers]

    return result.reset_index()

# Replace the raw records with the one-row-per-group ranking table.
# NOTE: `df` is overwritten here, so this cell needs the raw frame from the
# load cell to be in scope each time it runs.
df = transform_to_top5_format(df)

# Report the new shape and preview the first rows
print(f"Transformed dataset shape: {df.shape}")
print()
print("First 10 rows:")
print(df.head(n=10))

Transformed dataset shape: (903, 8)

First 10 rows:
   state     district         crop_type           crop_rank_1  \
0  Johor   Batu Pahat        cash_crops               cassava   
1  Johor   Batu Pahat             fruit             pineapple   
2  Johor   Batu Pahat             herbs  fragrant_lemon_grass   
3  Johor   Batu Pahat  industrial_crops               coconut   
4  Johor   Batu Pahat            spices       calamondin_lime   
5  Johor   Batu Pahat         vegetable             long_bean   
6  Johor  Johor Bahru        cash_crops               cassava   
7  Johor  Johor Bahru             fruit                banana   
8  Johor  Johor Bahru             herbs  fragrant_lemon_grass   
9  Johor  Johor Bahru  industrial_crops               coconut   

         crop_rank_2       crop_rank_3       crop_rank_4   crop_rank_5  
0                yam      sweet_potato        groundnuts    sweet_corn  
1             papaya            durian            banana         guava  
2            

In [81]:
# Combine all CSV data into a single dataset: load the per-district tables
def _load_and_report(path, label):
    """Read the CSV at `path`, print its shape under `label`, and return the frame."""
    frame = pd.read_csv(path)
    print(f"{label} shape: {frame.shape}")
    return frame


print("Load all CSV files...")

# District crop rankings (already in memory as `df`)
print(f"District data and Corps shape: {df.shape}")

# Average yearly weather information
df_weather = _load_and_report(
    'dataset/district_with_weather_data.csv',
    "Weather information",
)

# District elevation and slope
df_elevation = _load_and_report(
    'dataset/elevation and slope dataset 155 districts.csv',
    "District elevation and slope",
)

# District soil type
df_soil = _load_and_report(
    'dataset/Final_Malaysia_Soil_Dataset__155_Districts_csv.csv',
    "District soil type",
)

Load all CSV files...
District data and Corps shape: (903, 8)
Weather information shape: (155, 14)
District elevation and slope shape: (155, 9)
District soil type shape: (155, 7)


Here we see that the district crop table has more rows than the climate and elevation tables, because each district has one row per crop_type.

In [82]:
# Data cleaning
# Column headers of the lookup tables may carry stray whitespace
for lookup in (df_weather, df_elevation, df_soil):
    lookup.columns = lookup.columns.str.strip()

# Standardize state and district names (trim whitespace, Title Case).
# The crop table `df` is included: it is the left side of the merges below,
# and a key normalised on only one side of a join can fail to match.
for frame in (df, df_weather, df_elevation, df_soil):
    for key in ('state', 'district'):
        frame[key] = frame[key].str.strip().str.title()

print("Data cleaned - standardized state/district names")

Data cleaned - standardized state/district names


In [83]:
# Merging dataset
print("MERGING DATASETS")

# Step 1: start with the main crop table (using the existing df)
combined_df = df.copy()
print(f"Starting with crop data: {combined_df.shape}")

# Steps 2-4: attach each per-district lookup table with the same left join,
# so every crop row is kept even when a district has no match.
# validate='many_to_one' makes pandas raise MergeError if a lookup table holds
# a repeated (state, district) key, which would otherwise silently duplicate
# crop rows.
lookup_tables = [
    ("climate data", df_weather),
    ("elevation and slope data", df_elevation),
    ("soil data", df_soil),
]
for label, lookup in lookup_tables:
    combined_df = pd.merge(
        combined_df,
        lookup,
        on=['state', 'district'],
        how='left',
        validate='many_to_one',
    )
    print(f"After adding {label}: {combined_df.shape}")

print()
print("COMBINED DATASET")
print(f"Final dataset shape: {combined_df.shape}")
print(f"Columns: {combined_df.columns}")

# Save the combined dataset as a new CSV
combined_df.to_csv('dataset/district_information.csv', index=False)


MERGING DATASETS
Starting with crop data: (903, 8)
After adding climate data: (903, 20)
After adding elevation and slope data: (903, 27)
After adding soil data: (903, 32)

COMBINED DATASET
Final dataset shape: (903, 32)
Columns: Index(['state', 'district', 'crop_type', 'crop_rank_1', 'crop_rank_2',
       'crop_rank_3', 'crop_rank_4', 'crop_rank_5', 'lat', 'lon',
       'uv_index_max', 'temperature_2m_mean', 'cloud_cover_mean',
       'relative_humidity_2m_mean', 'sunshine_duration', 'precipitation_sum',
       'et0_fao_evapotranspiration', 'temperature_2m_min',
       'temperature_2m_max', 'shortwave_radiation_sum', 'elev_mean',
       'elev_min', 'elev_max', 'slope_mean', 'slope_min', 'slope_max',
       'slope_std', 'Land Use', 'OC (%)', 'OM (%)', 'Soil Depth (cm)',
       'Source'],
      dtype='object')


Now we can start preparing the data for modelling.

# PREPARE DATA


In [84]:
# Reload the merged table from disk
df = pd.read_csv('dataset/district_information.csv')

# Targets are the five ranked-species columns; every other column is a feature
Y_column = [f'crop_rank_{rank}' for rank in range(1, 6)]
y = df[Y_column]
X = df.drop(columns=Y_column)

print("Original features:", X.shape[1])
print("Samples:", df.shape[0])

Original features: 27
Samples: 903


# REMOVE REDUNDANT FEATURES

In [89]:
# Columns removed by hand before any statistical feature selection
redundant = {
    'state', 'district',
    'lat', 'lon',
    'temperature_2m_min', 'temperature_2m_max',
    'elev_min', 'elev_max',
    'slope_min', 'slope_max',
    'Source',
}

# Drop them by column label; the set is turned into a list for pandas
X_reduced = X.drop(columns=list(redundant))
print("Features after manual removal:", X_reduced.shape[1])


Features after manual removal: 16


In [90]:
# CORRELATION-BASED REMOVAL
def remove_correlated_features(df, threshold=0.85):
    """
    Find numeric columns that are highly correlated with an earlier column.

    Nothing is dropped here. The function returns the column names to remove
    together with the absolute correlation matrix of the numeric columns.
    A column is listed when its absolute correlation with any column placed
    before it exceeds `threshold`.
    """
    # Non-numeric columns are ignored
    numeric_part = df.select_dtypes(include=[np.number])
    corr_matrix = numeric_part.corr().abs()

    # Keep only the cells strictly above the diagonal so each pair is
    # inspected once and a column is never compared with itself.
    size = len(corr_matrix)
    above_diagonal = np.triu(np.ones((size, size), dtype=bool), k=1)
    pairwise = corr_matrix.where(above_diagonal)

    # Per column: does any earlier column exceed the threshold?
    exceeds = (pairwise > threshold).any(axis=0)
    to_remove = [name for name, flagged in exceeds.items() if flagged]

    return to_remove, corr_matrix

# Flag the numeric features whose absolute correlation exceeds 0.85
correlated_features, corr_matrix = remove_correlated_features(
    X_reduced,
    threshold=0.85,
)

# List each flagged feature on its own line
print("🔍 Highly correlated features to remove (correlation > 0.85):")
for feature in correlated_features:
    print("   -", feature)

🔍 Highly correlated features to remove (correlation > 0.85):
   - shortwave_radiation_sum
   - slope_mean
   - OM (%)


In [91]:
# Remove the highly correlated features.
# X_reduced is overwritten with its own reduced version, so running this cell
# a second time would ask pandas to drop columns that are already gone.
# errors='ignore' skips labels that are no longer present, which keeps the
# cell safe to re-run.
X_reduced = X_reduced.drop(columns=correlated_features, errors='ignore')
print(f"Features after correlation removal: {X_reduced.shape[1]}")

Features after correlation removal: 13


# ENCODE CATEGORICAL VARIABLES

In [92]:
# Work on a copy so X_reduced stays available unchanged
X_processed = X_reduced.copy()

# Text (object-dtype) columns are the ones that need encoding
categorical_cols = list(X_processed.select_dtypes(include=['object']).columns)
print(f"Categorical columns: {categorical_cols}")

# One fitted encoder per column, kept in label_encoders for later reuse.
# NOTE(review): astype(str) turns missing values into the text 'nan', so they
# are encoded as a category of their own rather than left missing - confirm
# this is intended.
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    as_text = X_processed[name].astype(str)
    X_processed[name] = encoder.fit_transform(as_text)

# Fill the remaining missing values with each column's median
column_medians = X_processed.median()
X_processed = X_processed.fillna(column_medians)

print(f"Final preprocessed features: {X_processed.shape[1]}")

Categorical columns: ['crop_type', 'Land Use']
Final preprocessed features: 13
