In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
import os

# Load the raw dataset (relative path: resolved against the notebook's working directory).
df = pd.read_csv("Fertilizer Prediction.csv")

# Encode categorical features and the target as integer codes.
# One fitted LabelEncoder is kept per column so the same mapping can be
# re-applied (and the prediction decoded) at inference time.
cat_cols = ["Soil Type", "Crop Type", "Fertilizer Name"]
encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Save encoders for inference later
os.makedirs("model_artifacts", exist_ok=True)
joblib.dump(encoders, "model_artifacts/encoders.pkl")

# Split into features and target
X = df.drop("Fertilizer Name", axis=1)
y = df["Fertilizer Name"]

# Stratify on the target so the train and validation splits keep the same
# class proportions; an unstratified split of a multi-class target can leave
# a fertilizer class under-represented or absent from the validation set.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
except ValueError:
    # train_test_split rejects stratification when it cannot place every class
    # in both splits (e.g. a class with a single row). Fall back to the plain
    # random split rather than failing the whole cell.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

# XGBoost format: label must be FIRST column
train_df = pd.concat([y_train, X_train], axis=1)
test_df = pd.concat([y_test, X_test], axis=1)

# Written without index or header row, so each file holds only numeric data rows.
train_df.to_csv("train.csv", index=False, header=False)
test_df.to_csv("test.csv", index=False, header=False)


In [12]:
import sagemaker
from sagemaker.s3 import S3Uploader

# SageMaker session and the bucket it reports as its default.
session = sagemaker.Session()
bucket = session.default_bucket()

# Every object written by this notebook lives under this key prefix.
prefix = "fertilizer-xgboost"

# Destination "folders" for the two CSV files produced by the previous cell.
s3_root = f"s3://{bucket}/{prefix}"
train_s3 = f"{s3_root}/train"
test_s3 = f"{s3_root}/test"

# Upload in the same order as before: training file first, then the test file.
for local_file, destination in (("train.csv", train_s3), ("test.csv", test_s3)):
    S3Uploader.upload(local_file, destination)

print("Data uploaded successfully.")


Data uploaded successfully.


In [14]:
import sagemaker
from sagemaker import image_uris

# Execution role handed to the estimator below.
role = sagemaker.get_execution_role()

# Look up the XGBoost container image for the region of the existing session
# (`session` is created in the upload cell).
container = image_uris.retrieve(
    framework="xgboost",
    region=session.boto_session.region_name,
    version="1.5-1",
)
print("Using container:", container)

# Single-instance training job; its output goes under the shared prefix.
xgb_estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/{prefix}/output",
)

# XGBoost hyperparameters. num_class is taken from the fitted target encoder,
# i.e. the number of distinct "Fertilizer Name" labels it saw.
xgb_hyperparameters = {
    "objective": "multi:softmax",
    "num_class": len(encoders["Fertilizer Name"].classes_),
    "num_round": 200,
    "eta": 0.2,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}
xgb_estimator.set_hyperparameters(**xgb_hyperparameters)

# Both channels are CSV; the held-out test split is supplied as "validation".
training_channels = {
    channel: sagemaker.inputs.TrainingInput(s3_uri, content_type="text/csv")
    for channel, s3_uri in (("train", train_s3), ("validation", test_s3))
}

xgb_estimator.fit(training_channels)



INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-23-15-43-27-163


Using container: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1
2025-11-23 15:43:28 Starting - Starting the training job...
2025-11-23 15:43:41 Starting - Preparing the instances for training...
2025-11-23 15:44:04 Downloading - Downloading input data...
2025-11-23 15:44:49 Downloading - Downloading the training image......
2025-11-23 15:45:56 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-23 15:45:47.922 ip-10-2-225-99.ec2.internal:8 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-23 15:45:47.944 ip-10-2-225-99.ec2.internal:8 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-23:15:45:48:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-23:15:45:48:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2025-11-23:15:45:48:INFO] No G