# Crop Production Optimizer Notebook


### Steps to follow the process

Importing necessary Libraries

Creating S3 bucket

Mapping train and test data in S3

Mapping the path of the models in S3

Training the XGBoost model and deploying it.


In [None]:
## importing necessary libraries
import os

import sagemaker
import boto3
from sagemaker.session import s3_input, Session

# Resource-level S3 handle used by the upload cells further down.
s3 = boto3.resource("s3")

In [None]:
## name bucket
# Placeholder value -- replace with the bucket this notebook should use.
bucket = "any-bucket-name"

# Region configured on the default boto3 session.
default_session = boto3.session.Session()
my_region = default_session.region_name
print(my_region)

us-east-1


In [None]:
## creating bucket
# Re-create the resource handle so this cell can also be run on its own.
s3 = boto3.resource('s3')
try:
    # us-east-1 is the S3 default region and must not be sent as a
    # LocationConstraint; every other region has to be named explicitly or
    # the CreateBucket request is rejected.
    if my_region in (None, "us-east-1"):
        s3.create_bucket(Bucket=bucket)
    else:
        s3.create_bucket(
            Bucket=bucket,
            CreateBucketConfiguration={"LocationConstraint": my_region},
        )
    print("Bucket Created")
except Exception as E:
    # Best effort: report the failure (e.g. the bucket already exists) and
    # let the notebook continue.
    print("error ", E)

Bucket Created


In [None]:
## creating model output path
# Training artifacts are written under s3://<bucket>/models/<model_name>.
model_name = "xgboost"
model_output_path = f"s3://{bucket}/models/{model_name}"
print(model_name, model_output_path, sep="\n")

xgboost
s3://crop-production-bucket/models/xgboost


### Downloading the dataset

In [None]:
# `import urllib` alone does not guarantee that the `request` submodule is
# loaded, so import it explicitly.
import urllib.request
import pandas as pd

try:
    # Use the raw-content URL: the github.com/.../blob/... address serves the
    # HTML file viewer, not the CSV itself.
    url = "https://raw.githubusercontent.com/g0urav-hustler/Crop-Production-Optimization/master/data/agricultural_data.csv"
    urllib.request.urlretrieve(url, "agricultural_data.csv")
    print("Data Downloaded")
except Exception as E:
    # Report the failure and continue; the next cell will fail to load the
    # file if the download did not succeed.
    print("error -",E)

Data Downloaded


In [None]:
import pandas as pd

# Load the downloaded CSV into `model_data`; a failure is reported, not raised.
try:
    model_data = pd.read_csv('./agricultural_data.csv')
except Exception as load_error:
    print('Data load error: ', load_error)
else:
    print('Success: Data loaded into dataframe.')

Success: Data loaded into dataframe.


In [None]:
## changing text categories into numbers
# Distinct crop names, sorted in place so the ordering is deterministic.
label_categories = pd.unique(model_data["Crops"])
label_categories.sort()
print(label_categories)
print(label_categories.size)

['apple' 'banana' 'blackgram' 'chickpea' 'coconut' 'coffee' 'cotton'
 'grapes' 'jute' 'kidneybeans' 'lentil' 'maize' 'mango' 'mothbeans'
 'mungbean' 'muskmelon' 'orange' 'papaya' 'pigeonpeas' 'pomegranate'
 'rice' 'watermelon']
22


In [None]:
# making label dictionaries
# Map every crop name to its position in the sorted category array.
label_dict = {crop: code for code, crop in enumerate(label_categories)}
print(label_dict)

{'apple': 0, 'banana': 1, 'blackgram': 2, 'chickpea': 3, 'coconut': 4, 'coffee': 5, 'cotton': 6, 'grapes': 7, 'jute': 8, 'kidneybeans': 9, 'lentil': 10, 'maize': 11, 'mango': 12, 'mothbeans': 13, 'mungbean': 14, 'muskmelon': 15, 'orange': 16, 'papaya': 17, 'pigeonpeas': 18, 'pomegranate': 19, 'rice': 20, 'watermelon': 21}


In [None]:
## making labels data for further use
# One-row frame: column names are the crop names, the single row holds their codes.
labels = pd.DataFrame(data=label_dict, index=[0])
labels.to_csv("labels.csv", index=False)

In [None]:
## save labels to bucket for after use
# Upload the local labels.csv to the key labels_data/labels.csv.
labels_object = s3.Object(bucket, "labels_data/labels.csv")
labels_object.upload_file("labels.csv")

In [None]:
## applying numerical data to target column
# Swap every crop name for its integer code from label_dict.
encoded_crops = model_data["Crops"].replace(label_dict)
model_data["Crops"] = encoded_crops
model_data.head()

Unnamed: 0,N,P,K,Temperature,Humidity,Ph,Rainfall,Crops
0,90,42,43,20.879744,82.002744,6.502985,202.935536,20
1,85,58,41,21.770462,80.319644,7.038096,226.655537,20
2,60,55,44,23.004459,82.320763,7.840207,263.964248,20
3,74,35,40,26.491096,80.158363,6.980401,242.864034,20
4,78,42,42,20.130175,81.604873,7.628473,262.71734,20


## Mapping train and test data in S3

In [None]:
## split model
from sklearn.model_selection import train_test_split

# Features are every column except the target; 20% is held out for testing.
features = model_data.drop(columns="Crops")
target = model_data["Crops"]
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

In [None]:
## storing train data to bucket
# Target column first, no header and no index in the written CSV.
train_frame = pd.concat([y_train, x_train], axis=1)
train_frame.to_csv("train.csv", index=False, header=False)
s3.Object(bucket, "train_data/train.csv").upload_file("train.csv")

In [None]:
## storing test data to bucket
# Same layout as the training file: target column first, no header, no index.
test_frame = pd.concat([y_test, x_test], axis=1)
test_frame.to_csv("test.csv", index=False, header=False)
s3.Object(bucket, "test_data/test.csv").upload_file("test.csv")

In [None]:
## importing train and test data
# Point SageMaker at the S3 prefixes that hold the uploaded CSV files.
train_prefix = f's3://{bucket}/train_data'
test_prefix = f's3://{bucket}/test_data'
s3_input_train = sagemaker.TrainingInput(s3_data=train_prefix, content_type='csv')
s3_input_test = sagemaker.TrainingInput(s3_data=test_prefix, content_type='csv')

## Building Model

In [None]:
# Look up the XGBoost container image URI for the current region.
# The version string selects which XGBoost container release is used.
from sagemaker.amazon.amazon_estimator import get_image_uri 

container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=my_region,
    version="1.2-1",
)


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [None]:
# initialize hyperparameters

# num_class is derived from the label list so it always matches the data.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    objective="multi:softmax",
    num_class=len(label_categories),
    num_round=50,
)
     

In [None]:
# Configure the training job. The argument names follow SageMaker SDK v2
# (instance_count, instance_type, ...); the old train_* spellings are
# deprecated and emit one warning per argument.
estimator = sagemaker.estimator.Estimator(image_uri=container,
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          instance_count=1,
                                          instance_type='ml.m5.xlarge',
                                          volume_size=5, # 5 GB
                                          output_path=model_output_path,
                                          use_spot_instances=True,
                                          max_run=300,   # seconds of training time
                                          max_wait=600)  # seconds incl. waiting for spot capacity

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
## fitting the train and test set
# The held-out test split is passed to the job as the 'validation' channel.
data_channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-02-17-10-23-21-332


2023-02-17 10:23:21 Starting - Starting the training job...
2023-02-17 10:23:36 Starting - Preparing the instances for training......
2023-02-17 10:24:24 Downloading - Downloading input data...
2023-02-17 10:25:20 Training - Training image download completed. Training in progress...[34m[2023-02-17 10:25:30.216 ip-10-0-244-184.ec2.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO

## Deploying the model

In [None]:
# deploying the model
# Creates a real-time endpoint; delete it after use to stop the charges.
xgb_predictor = estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2023-02-17-10-27-01-847
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2023-02-17-10-27-01-847
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2023-02-17-10-27-01-847


--------!

In [None]:
## predicting the test data using serializer
# numpy is needed to parse the response; import it here so the cell runs on
# a fresh kernel.
import numpy as np
from sagemaker.serializers import CSVSerializer

test_data_array = x_test.values #load the data into an array
xgb_predictor.CONTENT_TYPE = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = CSVSerializer()# set the serializer type
predictions = xgb_predictor.predict(test_data_array).decode('utf-8')
# Parse the whole comma-separated payload. Only a leading '[' (if any) is
# stripped: dropping the first character unconditionally would corrupt the
# first predicted label whenever the response starts with a digit.
# NOTE(review): assumes the endpoint returns comma-separated values -- confirm
# against the container's response format.
predictions_array = np.fromstring(predictions.lstrip("["), sep=',')
print(predictions_array.shape)

In [None]:
## deleting the endpoint so that service cost is stopped
# `endpoint_name` is the SDK v2 attribute; the old `endpoint` alias is
# deprecated and triggers a warning.
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint_name)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2023-02-17-10-27-01-847


## Testing the model

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class precision / recall / F1 of the endpoint predictions vs. the true labels.
report = classification_report(y_test, predictions_array)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       0.95      1.00      0.97        18
           2       1.00      1.00      1.00        22
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        15
           5       0.94      1.00      0.97        17
           6       1.00      1.00      1.00        16
           7       1.00      1.00      1.00        18
           8       0.86      0.90      0.88        21
           9       1.00      1.00      1.00        20
          10       1.00      0.94      0.97        17
          11       0.95      1.00      0.97        18
          12       1.00      1.00      1.00        21
          13       0.96      1.00      0.98        25
          14       1.00      1.00      1.00        17
          15       1.00      1.00      1.00        23
          16       1.00      1.00      1.00        23
          17       1.00    

### The XGBoost model achieves an accuracy of 98% on the test set.

## The End