#### Notebook to develop the predictive model

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import mean_squared_error

import xgboost as xgb

import joblib

#### First load in the data and split into train and test set

In [110]:
# Part 1: Load in the data
df = pd.read_csv("../data/dummy_data.csv")

# Part 2: Split the data into features and target
X = df.drop(['optimal_fertilizer_amount'], axis=1)
y = df['optimal_fertilizer_amount']

# Part 3: Split the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

#### Then define steps to preprocess the data using sklearn's Pipeline

In [111]:
# Part 4: Define the list numeric, nominal and ordinal features
numeric_features_list = X_train.select_dtypes(include='number').columns.tolist()
ordinal_features_list = ['plant_health', 'soil_color'] # ordinal means there is an order to the feature where healthy > yellowing	> wilting
nominal_features_list = ['crop_type']    # nominal means there is no order to the feature

# Part 5: Build a preprocessing step for numeric features
numeric_transformer = Pipeline(steps=[

    # Handling missing values by using median imputation as it would be less sensitive to outliers
    ('imputer', SimpleImputer(strategy='median')),

    # scale the numeric features to handle outliers
    ('scaler', RobustScaler())
    
])

# Part 6: define the preprocessing steps for nominal features
nominal_transformer = Pipeline(steps=[
    
    # convert categorical columns to numeric columns
    ('one_hot_encoding', OneHotEncoder(handle_unknown='ignore'))                       
])

# Part 7: Define the preprocessing steps for ordinal features
ordinal_transformer = Pipeline(steps=[
    
    # Convert categorical features to ranked numerical featuere
    ('ordinal_encoding', OrdinalEncoder())
])

# Part 8: Combine the preprocessing steps
preprocessor = ColumnTransformer(transformers=[
    
    ('numerical', numeric_transformer, numeric_features_list),
    ('nominal', nominal_transformer, nominal_features_list),
    ('ordinal', ordinal_transformer, ordinal_features_list)

])

# Part 9: Define and create the pipeline with XGBoost
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb.XGBRegressor(n_estimators=100, random_state=42))
])

model

#### Train and evaluate the model

In [113]:
# Fit the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# And evaluate the model
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

RMSE: 4.27734646202134


#### And finally make a test prediction using one row of data

In [106]:
conditions = pd.DataFrame([{
    'soil_color': 'light brown',
    'soil_ph': 6.5,
    'soil_n': 30.0,
    'soil_p': 40.0,
    'temp': 28.0,
    'rainfall': 120.0,
    'forecast_temp': 31.0,
    'forecast_rainfall': 100.0,
    'crop_type': 'corn',
    'plant_health': 'healthy'
    }])

predicted_fertilizer_amount = model.predict(conditions)
print(f"Predicted Optimal Fertilizer Amount: {predicted_fertilizer_amount[0]}")

Predicted Optimal Fertilizer Amount: 134.68853759765625


#### Then save the model as an artefact

In [103]:
# Save the model 
joblib.dump(model, "../model/model_pipeline.joblib")

['../model/model_pipeline.joblib']