## Install and Import XGBoost Library

In [None]:
import sys
!{sys.executable} -m pip install xgboost

In [None]:
import xgboost
print(xgboost.__version__)

# Pandas is used for data manipulation
import pandas as pd
import numpy as np
import sklearn
from numpy import absolute
from pandas import read_csv
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split

#### Data and Preparation

In [5]:

# Read in data as pandas dataframe and display first 5 rows
features = pd.read_csv('final-data-edited-nozero.csv')
features.head(5)

Unnamed: 0,State,Average_avg-Temp,Average-Min Temp,Average-max-temp,avg-precipitation,avg-windSpeed,pH,Clay,Sand,Silt,Crop Yield,Hectare
0,Plateau,21.258682,15.489182,27.02874,107.791667,2.258682,5.833333,29.5,43.166667,25.833333,2.603023,1.686768
1,Taraba,21.287979,15.765146,26.809448,136.270833,1.733292,5.4,37.833333,35.833333,25.0,2.389935,2.622656
2,Plateau,21.341573,16.420537,27.24726,103.395833,2.25851,6.2,27.5,44.5,29.333333,2.603023,1.686768
3,Plateau,21.474099,15.814495,27.124927,111.145833,2.237911,5.3,30.833333,43.666667,23.833333,2.603023,1.686768
4,Plateau,21.523187,15.900604,27.139937,101.75,2.40501,5.85,30.0,48.166667,23.833333,2.603023,1.686768


In [7]:
# One-hot encode categorical features
features = pd.get_dummies(features)
features.replace({False: 0, True: 1}, inplace=True)
features.head(5)

Unnamed: 0,Average_avg-Temp,Average-Min Temp,Average-max-temp,avg-precipitation,avg-windSpeed,pH,Clay,Sand,Silt,Crop Yield,...,State_Kebbi,State_Kwara,State_Lagos,State_Ogun,State_Ondo,State_Osun,State_Oyo,State_Plateau,State_Rivers,State_Taraba
0,21.258682,15.489182,27.02874,107.791667,2.258682,5.833333,29.5,43.166667,25.833333,2.603023,...,0,0,0,0,0,0,0,1,0,0
1,21.287979,15.765146,26.809448,136.270833,1.733292,5.4,37.833333,35.833333,25.0,2.389935,...,0,0,0,0,0,0,0,0,0,1
2,21.341573,16.420537,27.24726,103.395833,2.25851,6.2,27.5,44.5,29.333333,2.603023,...,0,0,0,0,0,0,0,1,0,0
3,21.474099,15.814495,27.124927,111.145833,2.237911,5.3,30.833333,43.666667,23.833333,2.603023,...,0,0,0,0,0,0,0,1,0,0
4,21.523187,15.900604,27.139937,101.75,2.40501,5.85,30.0,48.166667,23.833333,2.603023,...,0,0,0,0,0,0,0,1,0,0


In [8]:
features2 = features.drop_duplicates()
features2.head(5)

Unnamed: 0,Average_avg-Temp,Average-Min Temp,Average-max-temp,avg-precipitation,avg-windSpeed,pH,Clay,Sand,Silt,Crop Yield,...,State_Kebbi,State_Kwara,State_Lagos,State_Ogun,State_Ondo,State_Osun,State_Oyo,State_Plateau,State_Rivers,State_Taraba
0,21.258682,15.489182,27.02874,107.791667,2.258682,5.833333,29.5,43.166667,25.833333,2.603023,...,0,0,0,0,0,0,0,1,0,0
1,21.287979,15.765146,26.809448,136.270833,1.733292,5.4,37.833333,35.833333,25.0,2.389935,...,0,0,0,0,0,0,0,0,0,1
2,21.341573,16.420537,27.24726,103.395833,2.25851,6.2,27.5,44.5,29.333333,2.603023,...,0,0,0,0,0,0,0,1,0,0
3,21.474099,15.814495,27.124927,111.145833,2.237911,5.3,30.833333,43.666667,23.833333,2.603023,...,0,0,0,0,0,0,0,1,0,0
4,21.523187,15.900604,27.139937,101.75,2.40501,5.85,30.0,48.166667,23.833333,2.603023,...,0,0,0,0,0,0,0,1,0,0


In [11]:
# Drop duplicate rows
features2 = features.drop_duplicates()
#Drop features not included after feature selection

features2= features2.drop(['Average_avg-Temp', 'Average-max-temp', 'Clay'], axis = 1)
# Use numpy to convert to arrays
import numpy as np

# Labels are the values we want to predict
labels = np.array(features2['Crop Yield'])

# Remove the labels from the features
# axis 1 refers to the columns
data= features2.drop('Crop Yield', axis = 1)

# Saving feature names for later use
data_list = list(data.columns)

# Convert to numpy array
data = np.array(data)

#### Data Split

In [12]:
# Split the data into training, validation, and testing sets
train_features, val_test_features, train_labels, val_test_labels = train_test_split(data, labels, test_size = 0.3,
                                                                           shuffle= True, random_state = 0)
X_val, X_test, Y_val, Y_test = train_test_split(val_test_features, val_test_labels, test_size=0.5)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', Y_test.shape)
print('Validation Features Shape:', X_val.shape)
print('Validation Label Shape:', Y_val.shape)

Training Features Shape: (1142, 30)
Training Labels Shape: (1142,)
Testing Features Shape: (245, 30)
Testing Labels Shape: (245,)
Validation Features Shape: (245, 30)
Validation Label Shape: (245,)


#### Model

In [104]:
# RF Regression Model model 
model = XGBRegressor(max_depth = 10, subsample=1, n_estimators = 900,learning_rate=0.1)
model.fit(train_features, train_labels)

In [105]:
# evaluate an xgboost regression model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(model, X_test, Y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = absolute(scores)
print('Mean MAE: %.6f (%.6f)' % (scores.mean(), scores.std()) )

Mean MAE: 0.008391 (0.009020)


In [106]:
# Use the forest's predict method on the test data
predictions = model.predict(X_test)


In [107]:
from sklearn.metrics import mean_squared_error
# Mean Squared Error (MSE)
mse = mean_squared_error(Y_test, predictions)
print('Mean Squared Error (MSE): ', mse)

Mean Squared Error (MSE):  3.391448662987479e-05


In [112]:
from math import sqrt
rmse = sqrt(mse)
print('Root Mean Absolute Error:', rmse, 'degrees.')

Root Mean Absolute Error: 0.005823614567420718 degrees.


In [108]:
from sklearn.metrics import mean_absolute_percentage_error
# Mean Squared Error (MSE)
mape = mean_absolute_percentage_error(Y_test, predictions)
print('mean absolute percentage error (MAPE): ', mape)

mean absolute percentage error (MAPE):  0.0010763127768356965


In [109]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(Y_test, predictions)
print('Mean Absolute Error (MAE): ', mae)

Mean Absolute Error (MAE):  0.00043220376695762613


#### Save Model

In [21]:
import pickle
filename = 'XG_model.dat'
pickle.dump(model, open(filename, "wb"))


In [11]:
import pickle
model = pickle.load(open("XG_model.dat", "rb"))

#### Testing

#### Generalisation to unseen set

In [26]:
tt = np.array(X_test.reshape(245,30))
pred = model.predict(tt)
pred = pd.DataFrame(pred)
yact = pd.DataFrame(Y_test)
all = pd.concat([pred, yact], axis=1)


all.columns = ["y_pred", "y_actual"]
all.to_csv('predictedXG&actual.csv')

#### Generalisation to Unforeseen set

###### Change to precipitation

In [12]:
tf = np.array([ 21.69209,   13.52083,     1.4988489,   5.4666667,  59.833332,   10.166667,   0.5454889,   0. ,         0.,          0.,          0.,          0.,   0.,          0.,          0.,          0.,          0.,          0.,1.,          0.,          0.,          0.,          0.,          0.,  0.,          0.,          0.,          0.,          0. ,         0. ])
model.predict(np.array(tf).reshape(1,30))

array([0.7093763], dtype=float32)

In [15]:
tf2 = np.array([16.685463 , 9.125 ,     2.417177,   5.5666666 ,35.5   ,    27.333334,  1.6867675,  0.  ,       0.    ,     0. ,        0.         ,0.,  0.  ,       0.   ,      0.   ,      0.   ,      0.   ,      0.,  0.  ,       0.   ,      0.    ,     0.    ,     0.    ,     0.,  0.   ,      0.   ,      0.  ,       1.    ,     0.  ,       0.       ])
model.predict(np.array(tf2).reshape(1,30))

array([2.6028855], dtype=float32)

###### change silt

In [14]:
tf = np.array([ 21.69209,   133.52083,     1.4988489,   5.4666667,  59.833332,   29.166667,   0.5454889,   0. ,         0.,          0.,          0.,          0.,   0.,          0.,          0.,          0.,          0.,          0.,1.,          0.,          0.,          0.,          0.,          0.,  0.,          0.,          0.,          0.,          0. ,         0. ])
model.predict(np.array(tf).reshape(1,30))

array([0.7093763], dtype=float32)

In [16]:
tf2 = np.array([16.685463 , 99.125 ,     2.417177,   5.5666666 ,35.5   ,    50.333334,  1.6867675,  0.  ,       0.    ,     0. ,        0.         ,0.,  0.  ,       0.   ,      0.   ,      0.   ,      0.   ,      0.,  0.  ,       0.   ,      0.    ,     0.    ,     0.    ,     0.,  0.   ,      0.   ,      0.  ,       1.    ,     0.  ,       0.       ])
model.predict(np.array(tf2).reshape(1,30))

array([2.6028855], dtype=float32)