In [1]:
# Cell 1: Install and import libraries
import pandas as pd
import numpy as np
import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import io
import json

# Sanity banner: confirms the import cell ran and shows the installed SDK version.
print("‚úÖ Libraries imported")
print(f"SageMaker version: {sagemaker.__version__}")

# SageMaker session and execution role, plus the S3 bucket/prefix used by later cells.
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket, prefix = 'soilmonitoring-models', 'xgboost-crop'

# A single print call that emits three lines: role, bucket, prefix.
print(
    f"SageMaker role: {role}\n"
    f"S3 bucket: {bucket}\n"
    f"S3 prefix: {prefix}"
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
‚úÖ Libraries imported
SageMaker version: 2.254.1
SageMaker role: arn:aws:iam::065126182608:role/LabRole
S3 bucket: soilmonitoring-models
S3 prefix: xgboost-crop


In [2]:
# Cell 2: Load the crop dataset and separate the features from the target
print("üìÇ Loading dataset...")

# Read the CSV; a leftover 'Unnamed: 0' index column is discarded when present.
df = pd.read_csv('Crop_recommendation.csv')
df = df.drop(columns='Unnamed: 0', errors='ignore')

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst 5 rows:")
display(df.head())

# The seven input columns and the column to predict.
target_column = 'label'
feature_columns = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

y = df[target_column]
X = df[feature_columns]

print(f"\n‚úÖ Features: {feature_columns}")
print(f"‚úÖ Target: {target_column}")
print(f"‚úÖ Number of crops: {y.nunique()}")
print(f"\nüåæ Crops: {sorted(y.unique())}")

üìÇ Loading dataset...
Dataset shape: (2200, 8)
Columns: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']

First 5 rows:


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice



‚úÖ Features: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
‚úÖ Target: label
‚úÖ Number of crops: 22

üåæ Crops: ['apple', 'banana', 'blackgram', 'chickpea', 'coconut', 'coffee', 'cotton', 'grapes', 'jute', 'kidneybeans', 'lentil', 'maize', 'mango', 'mothbeans', 'mungbean', 'muskmelon', 'orange', 'papaya', 'pigeonpeas', 'pomegranate', 'rice', 'watermelon']


In [3]:
# Cell 3: Encode labels and split data
import joblib

print("üîß Encoding labels...")

# XGBoost needs numeric labels, so every crop name is mapped to an integer class id.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Label mapping:")
for class_id, crop_name in enumerate(label_encoder.classes_):
    print(f"  {crop_name}: {class_id}")

# Stratified split holding out 20% of the rows, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

print("\nüìä Data split:")
print(f"  Training: {len(X_train)} samples")
print(f"  Testing: {len(X_test)} samples")

# Persist the fitted encoder so class ids can be turned back into crop names later.
joblib.dump(label_encoder, 'crop_label_encoder_xgboost.pkl')
print("\n‚úÖ Label encoder saved")

üîß Encoding labels...
Label mapping:
  apple: 0
  banana: 1
  blackgram: 2
  chickpea: 3
  coconut: 4
  coffee: 5
  cotton: 6
  grapes: 7
  jute: 8
  kidneybeans: 9
  lentil: 10
  maize: 11
  mango: 12
  mothbeans: 13
  mungbean: 14
  muskmelon: 15
  orange: 16
  papaya: 17
  pigeonpeas: 18
  pomegranate: 19
  rice: 20
  watermelon: 21

üìä Data split:
  Training: 1760 samples
  Testing: 440 samples

‚úÖ Label encoder saved


In [4]:
# Cell 4: Create training data in XGBoost format
print("üìù Preparing data for SageMaker XGBoost...")


def _label_first_frame(labels, features):
    """Build a frame whose first column is 'label', followed by the feature columns.

    The feature index is reset to 0..n-1 before concatenation. This assumes
    `labels` is array-like without an index of its own, so the resulting
    Series also gets a 0..n-1 index and the rows pair up by position.
    """
    label_column = pd.Series(labels, name='label')
    return pd.concat([label_column, features.reset_index(drop=True)], axis=1)


# Layout written below: label first, then the features, no header row.
train_data = _label_first_frame(y_train, X_train)
test_data = _label_first_frame(y_test, X_test)

print(f"Training data shape: {train_data.shape}")
print(f"Test data shape: {test_data.shape}")
print("\nFirst 3 rows of training data:")
display(train_data.head(3))

# Write both splits as plain CSV without header or index column.
train_file, test_file = 'train.csv', 'test.csv'
train_data.to_csv(train_file, index=False, header=False)
test_data.to_csv(test_file, index=False, header=False)

print(f"\n‚úÖ Saved: {train_file}")
print(f"‚úÖ Saved: {test_file}")

üìù Preparing data for SageMaker XGBoost...
Training data shape: (1760, 8)
Test data shape: (440, 8)

First 3 rows of training data:


Unnamed: 0,label,N,P,K,temperature,humidity,ph,rainfall
0,16,0,18,14,29.771494,92.0072,7.207991,114.416179
1,7,9,122,201,29.587484,80.919344,5.570291,68.064173
2,9,11,71,24,21.140114,22.718235,5.60662,141.605672



‚úÖ Saved: train.csv
‚úÖ Saved: test.csv


In [5]:
# Cell 5: Upload training data to S3
print("üì§ Uploading data to S3...")

s3_client = boto3.client('s3')

# Object keys for the two splits and the matching s3:// URIs.
train_s3_path = f'{prefix}/train/train.csv'
test_s3_path = f'{prefix}/test/test.csv'
train_input = f's3://{bucket}/{train_s3_path}'
test_input = f's3://{bucket}/{test_s3_path}'

# Push both local CSV files into the bucket.
s3_client.upload_file(Filename='train.csv', Bucket=bucket, Key=train_s3_path)
s3_client.upload_file(Filename='test.csv', Bucket=bucket, Key=test_s3_path)

print(f"‚úÖ Training data: {train_input}")
print(f"‚úÖ Test data: {test_input}")

# The pickled label encoder is stored next to the data under artifacts/.
encoder_s3_path = f'{prefix}/artifacts/crop_label_encoder_xgboost.pkl'
s3_client.upload_file(
    Filename='crop_label_encoder_xgboost.pkl',
    Bucket=bucket,
    Key=encoder_s3_path,
)
print(f"‚úÖ Label encoder: s3://{bucket}/{encoder_s3_path}")

üì§ Uploading data to S3...
‚úÖ Training data: s3://soilmonitoring-models/xgboost-crop/train/train.csv
‚úÖ Test data: s3://soilmonitoring-models/xgboost-crop/test/test.csv
‚úÖ Label encoder: s3://soilmonitoring-models/xgboost-crop/artifacts/crop_label_encoder_xgboost.pkl


In [6]:
# Cell 6: Get XGBoost container image
from sagemaker.image_uris import retrieve

print("üê≥ Getting XGBoost container image...")

# Look up the XGBoost 1.5-1 image URI for the region of the default boto3 session.
region = boto3.Session().region_name
container = retrieve(framework='xgboost', region=region, version='1.5-1')

print(
    f"Region: {region}\n"
    f"XGBoost container: {container}"
)

üê≥ Getting XGBoost container image...
Region: us-east-1
XGBoost container: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1


In [7]:
# Cell 7: Train XGBoost model with SageMaker
from sagemaker.estimator import Estimator

print("ü§ñ Configuring XGBoost training job...")

# Where the training job writes its output in S3.
output_path = f's3://{bucket}/{prefix}/output'

# One class per crop known to the fitted label encoder.
num_classes = len(label_encoder.classes_)

# Estimator settings: a single ml.m5.large training instance running the XGBoost image.
estimator_settings = {
    'image_uri': container,
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.m5.large',
    'output_path': output_path,
    'sagemaker_session': sagemaker_session,
    'base_job_name': 'xgboost-crop-recommendation',
}
xgboost_estimator = Estimator(**estimator_settings)

# Multi-class softmax objective, 100 boosting rounds, multi-class error as the metric.
hyperparameters = {
    'objective': 'multi:softmax',
    'num_class': num_classes,
    'num_round': 100,
    'max_depth': 6,
    'eta': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eval_metric': 'merror',
}
xgboost_estimator.set_hyperparameters(**hyperparameters)

print("‚úÖ XGBoost estimator configured")
print(f"Number of classes: {num_classes}")
print(f"Output path: {output_path}")

# Both channels are CSV; the held-out test split is passed as the 'validation' channel.
train_channel = TrainingInput(train_input, content_type='text/csv')
test_channel = TrainingInput(test_input, content_type='text/csv')
channels = {'train': train_channel, 'validation': test_channel}

print("\nüöÄ Starting training job...")
print("‚è∞ This will take 5-10 minutes...")

# Launch the training job with both channels.
xgboost_estimator.fit(channels)

print("\n‚úÖ Training complete!")


INFO:sagemaker:Creating training-job with name: xgboost-crop-recommendation-2025-11-22-21-09-08-426


ü§ñ Configuring XGBoost training job...
‚úÖ XGBoost estimator configured
Number of classes: 22
Output path: s3://soilmonitoring-models/xgboost-crop/output

üöÄ Starting training job...
‚è∞ This will take 5-10 minutes...
2025-11-22 21:09:09 Starting - Starting the training job...
2025-11-22 21:09:25 Starting - Preparing the instances for training...
2025-11-22 21:09:47 Downloading - Downloading input data...
2025-11-22 21:10:30 Downloading - Downloading the training image......
2025-11-22 21:11:41 Training - Training image download completed. Training in progress.
  from pandas import MultiIndex, Int64Index[0m
[34m[2025-11-22 21:11:32.733 ip-10-0-226-198.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2025-11-22 21:11:32.759 ip-10-0-226-198.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2025-11-22:21:11:33:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2025-11-22:21:11:33:INFO] Failed t

In [14]:

# Invoke the deployed endpoint with one sample row and decode the predicted crop.
import boto3
import pandas as pd

runtime = boto3.client("sagemaker-runtime")
endpoint_name = "xgboost-crop-endpoint"

# Column order of the CSV payload. The payload has no header, so this order
# must match the feature order used for the training CSV (label excluded).
feature_order = ["N", "P", "K", "temperature", "humidity", "ph", "rainfall"]

# Crop names listed so that position i corresponds to class id i, in the same
# order as the label mapping printed when the encoder was fitted.
crop_names = [
    "apple",
    "banana",
    "blackgram",
    "chickpea",
    "coconut",
    "coffee",
    "cotton",
    "grapes",
    "jute",
    "kidneybeans",
    "lentil",
    "maize",
    "mango",
    "mothbeans",
    "mungbean",
    "muskmelon",
    "orange",
    "papaya",
    "pigeonpeas",
    "pomegranate",
    "rice",
    "watermelon",
]
label_map = dict(enumerate(crop_names))

# Example row. It is kept under its own name so the full dataset held in `df`
# is not replaced by a one-row frame when this cell runs.
sample_row = pd.DataFrame({
    "N": [0],
    "P": [18],
    "K": [14],
    "temperature": [29.771494],
    "humidity": [92.007200],
    "ph": [7.207991],
    "rainfall": [114.416179],
})

# Select the columns explicitly so the payload order never depends on how the
# frame above was constructed.
payload = sample_row[feature_order].to_csv(index=False, header=False).strip()

response = runtime.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="text/csv",
    Body=payload
)

raw_pred = response["Body"].read().decode("utf-8").strip()
if not raw_pred:
    raise ValueError("Endpoint returned an empty response body")

# The body is parsed as a float first and then truncated to an integer class id.
pred_class = int(float(raw_pred))
if pred_class not in label_map:
    raise KeyError(
        f"Endpoint returned class id {pred_class}, which is not in label_map "
        f"(expected 0-{len(label_map) - 1})"
    )

print("Predicted class:", pred_class)
print("Predicted crop:", label_map[pred_class])


Predicted class: 16
Predicted crop: orange
