# Importing modules and libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')

# Loading the datasets

In [2]:
# Read the three challenge CSV files from the working directory.
train_df, test_df, test_gap_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Train(1).csv', 'Test.csv', 'Gap_Test.csv')
)

Adding each sample's BulkDensity from the Test dataset to the Gap_Test dataset (left join on PID)

In [5]:
# Attach each sample's BulkDensity (from test_df) to the gap table; it is used
# later to convert the predicted ppm values into kg/ha.
# Dropping any BulkDensity column left over from a previous run keeps this cell
# re-runnable: without it, a second execution would produce BulkDensity_x /
# BulkDensity_y and break the later nutrient_df['BulkDensity'] lookup.
# NOTE(review): this runs before the null-filling cell, so any missing
# BulkDensity in test_df is carried into test_gap_df as NaN - confirm intended.
test_gap_df = pd.merge(
    test_gap_df.drop(columns='BulkDensity', errors='ignore'),
    test_df[['PID', 'BulkDensity']],
    on='PID',
    how='left',
)

Filling in null values

In [7]:
def fill_nulls_with_column_means(df):
    """Return a copy of ``df`` whose numeric nulls are replaced by column means.

    Each numeric column is filled with its own mean, computed from ``df``
    itself. Non-numeric columns are left untouched (taking their mean would
    raise), and a column that is entirely null stays null because its mean is
    NaN.

    The whole frame is filled in one call instead of the chained
    ``df[column].fillna(..., inplace=True)`` pattern, which acts on a temporary
    object under pandas Copy-on-Write and would leave the frame unchanged.
    """
    return df.fillna(df.mean(numeric_only=True))


# Each frame is imputed with its own column means.
# NOTE(review): for train_df this also fills missing values in the target
# columns with the column mean - confirm that imputing labels is intended.
train_df = fill_nulls_with_column_means(train_df)
test_df = fill_nulls_with_column_means(test_df)


# Feature engineering

In [None]:
# Columns the model predicts; the order here is the column order of the
# target matrix built from this list.
target_columns = [
    'N', 'P', 'K',
    'Ca', 'Mg', 'S',
    'Fe', 'Mn', 'Zn', 'Cu', 'B',
]

# --- Feature Engineering ---
def create_features(df):
    """Return a copy of ``df`` extended with interaction and ratio features.

    A feature is added only when both of its source columns are present, so a
    frame that lacks them passes through with its columns unchanged. The input
    frame is never modified; callers must use the returned frame.

    Features, in the order they are appended:
        pH_x_bio1            pH * bio1
        pH_x_bio12           pH * bio12
        soc20_x_BulkDensity  soc20 * BulkDensity
        bio12_div_bio1       bio12 / bio1 (NaN where bio1 == 0)

    Args:
        df: DataFrame that may contain some or all of the source columns.

    Returns:
        A new DataFrame holding the original columns plus the derived ones.
    """
    out = df.copy()
    present = set(df.columns)

    products = (
        ('pH_x_bio1', 'pH', 'bio1'),
        ('pH_x_bio12', 'pH', 'bio12'),
        ('soc20_x_BulkDensity', 'soc20', 'BulkDensity'),
    )
    for feature_name, left, right in products:
        if left in present and right in present:
            out[feature_name] = out[left] * out[right]

    if 'bio12' in present and 'bio1' in present:
        # Zero denominators are turned into NaN so the ratio is NaN, not +/-inf.
        out['bio12_div_bio1'] = out['bio12'] / out['bio1'].replace(0, np.nan)

    return out

# Apply feature engineering (working on copies of the frames from earlier cells)
train_df = create_features(train_df.copy())
test_df = create_features(test_df.copy())

# Separate identifier and target columns from the model inputs.
# Only columns that actually exist are dropped, so a missing 'site' or 'PID'
# does not raise.
id_columns = ['PID', 'site']
features_to_drop = target_columns + id_columns
X = train_df.drop(columns=[col for col in features_to_drop if col in train_df.columns])
y = train_df[target_columns]
X_test = test_df.drop(columns=[col for col in id_columns if col in test_df.columns])
test_pids = test_df['PID']

# Align columns between train and test: a column missing on one side is added
# there as a constant 0. The missing columns are collected by walking the other
# frame's column order (not a set difference), so the resulting layout does not
# depend on string hashing and is identical on every kernel restart.
missing_in_test = [col for col in X.columns if col not in X_test.columns]
missing_in_train = [col for col in X_test.columns if col not in X.columns]
for col in missing_in_test:
    X_test[col] = 0
for col in missing_in_train:
    X[col] = 0
X_test = X_test[X.columns]
# NOTE(review): the following cell reassigns X, y and X_test from train_df /
# test_df - check that the two cells agree on the feature set.


Selecting the features and labels, then splitting the data

In [9]:
# PID and site stay in X for now: a later cell drops them from X_train / X_val
# after the split.
id_columns = ['PID', 'site']
X = train_df.drop(columns=target_columns)
y = train_df[target_columns]

# Build X_test with exactly the columns the model will be trained on, in the
# same order. Columns that test_df lacks are created as constant 0, and columns
# that only exist in test_df are left out; otherwise predict() would see a
# feature layout that differs from the one used in fit().
feature_columns = [col for col in X.columns if col not in id_columns]
X_test = test_df.reindex(columns=feature_columns, fill_value=0)

In [27]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

In [28]:
# Remove the identifier columns so only features reach the model.
# errors='ignore' makes the cell safe to re-run: on a second execution the
# columns are already gone, and a plain drop would raise KeyError.
X_train = X_train.drop(columns=['PID', 'site'], errors='ignore')
X_val = X_val.drop(columns=['PID', 'site'], errors='ignore')

# Training the model

In [30]:
# MultiOutputRegressor fits one RandomForestRegressor per column of y_train.
# random_state fixes the forests' bootstrap and feature sampling so that the
# validation metrics and the submission are reproducible across kernel restarts.
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

Model evaluation

In [31]:
# Predict every target column for the held-out validation rows.
y_pred = model.predict(X_val)

In [32]:
# Score the validation predictions. Both metrics are computed over all target
# columns at once, so each is a single number for the whole target matrix;
# RMSE is derived as the square root of the MSE.
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)
print(f'MAE: {mae:.4f}, RMSE: {rmse:.4f}')

MAE: 159.4411, RMSE: 479.8398


# Creating the submission file

(Following the format given in the challenge info section)

In [33]:
# Predict every nutrient for the test samples. The model was fitted on the
# untransformed target columns, so its outputs are already on the targets'
# scale. They must NOT go through np.expm1: that only inverts a log1p
# transform, which was never applied, and would exponentiate every prediction.
test_predictions = model.predict(X_test)

# One column per nutrient. The prediction columns follow target_columns,
# which is the column order of the y matrix used for fitting.
submission = pd.DataFrame(test_predictions, columns=target_columns)
submission.insert(0, 'PID', test_df['PID'].to_numpy())

# Long format: one row per (PID, Nutrient) pair, matching the gap table keys.
submission_melted = submission.melt(
    id_vars=['PID'],
    var_name='Nutrient',
    value_name='Available_Nutrients_in_ppm',
)
submission_melted = submission_melted.sort_values('PID')
nutrient_df = pd.merge(test_gap_df, submission_melted, on=['PID', 'Nutrient'], how='left')

# Convert ppm to kg/ha.
# NOTE(review): 20 and 0.1 are presumably the soil depth in cm and the unit
# factor from the challenge's conversion formula - confirm against the
# challenge info.
nutrient_df['Available_Nutrients_in_kg_ha'] = (nutrient_df['Available_Nutrients_in_ppm']
                                               * 20 * nutrient_df['BulkDensity'] * 0.1)

# Gap = required amount minus what is available; the ID is "<PID>_<Nutrient>".
nutrient_df["Gap"] = nutrient_df["Required"] - nutrient_df["Available_Nutrients_in_kg_ha"]
nutrient_df['ID'] = nutrient_df['PID'] + "_" + nutrient_df['Nutrient']
nutrient_df = nutrient_df[['ID', 'Gap']]

In [34]:
# Write the ID / Gap table without the index column, then confirm on stdout.
submission_path = 'submission.csv'
nutrient_df.to_csv(submission_path, index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
