In [55]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

The crop dataset is read from `cropStats.csv`; the `Location` column is then label-encoded so that the model can use it as a numeric feature.

In [56]:

# Load the raw crop statistics, keeping every row but discarding the
# first and the last column of the CSV.
df = pd.read_csv('cropStats.csv').iloc[:, 1:-1]

# Turn each distinct location name into an integer code.
label_encoder = LabelEncoder()
location_codes = label_encoder.fit_transform(df['Location'])

# Swap the text 'Location' column for its encoded version, placed as the
# first column of the frame.
df = df.drop(columns='Location')
df.insert(loc=0, column='Location_LabelEncoded', value=location_codes)
df



Unnamed: 0,Location_LabelEncoded,1900,1901,1902,1903,1904,1905,1906,1907,1908,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,0,1064308.4,1044074.4,1084542.4,1088589.2,1052168.0,1072402.0,1048121.2,1011700.0,1027887.2,...,95099.8,101170.0,101170.0,101170.0,119380.6,119380.6,115333.8,99146.6,127474.2,95099.8
1,1,4451.48,4856.16,4046.8,4856.16,4046.8,5260.84,5260.84,6070.2,6474.88,...,6070.2,8093.6,8902.96,14163.8,12949.76,20638.68,11331.04,14568.48,20234.0,12949.76
2,2,922670.4,886249.2,906483.2,870062.0,837687.6,821500.4,821500.4,849828.0,866015.2,...,174012.4,165918.8,153778.4,210433.6,281252.6,352071.6,214480.4,180082.6,301486.6,240784.6
3,3,25090.16,25494.84,25899.52,25090.16,23876.12,24685.48,25090.16,24280.8,23471.44,...,68795.6,64748.8,72842.4,60702.0,72842.4,72842.4,38444.6,24280.8,40468.0,32374.4
4,4,38039.92,44110.12,53013.08,57869.24,66367.52,72842.4,78912.6,84982.8,105216.8,...,408726.8,400633.2,489662.8,526084.0,408726.8,396586.4,408726.8,384446.0,473475.6,526084.0
5,5,78912.6,78103.24,77698.56,80126.64,78912.6,76889.2,76889.2,76889.2,76889.2,...,61511.36,65962.84,70009.64,73651.76,72033.04,70414.32,67986.24,66367.52,66367.52,69200.28
6,6,232691.0,230667.6,238761.2,240784.6,240784.6,248878.2,246854.8,240784.6,234714.4,...,14163.8,14973.16,10117.0,13354.44,16187.2,31565.04,16187.2,20234.0,16187.2,14973.16
7,7,1444707.6,1388052.4,1388052.4,1363771.6,1339490.8,1408286.4,1379958.8,1367818.4,1355678.0,...,125450.8,149731.6,99146.6,109263.6,125450.8,188176.2,125450.8,115333.8,137591.2,99146.6
8,8,4046.8,4046.8,3642.12,4046.8,3237.44,3642.12,3642.12,4046.8,4046.8,...,32374.4,32374.4,44514.8,48561.6,54631.8,46538.2,32374.4,28327.6,40468.0,46538.2
9,9,4232952.8,4330076.0,4390778.0,4269374.0,4249140.0,4249140.0,4249140.0,4208672.0,3965864.0,...,4815692.0,4775224.0,5018032.0,4997798.0,4957330.0,4775224.0,4754990.0,4653820.0,4633586.0,4431246.0


The dataset is split into features `X` and a target `y`: the `2017` column is the target (`y`), and all remaining columns are the features (`X`).

In [57]:
# The '2017' column is the value to predict; every other column is a feature.
y = df['2017']
X = df.drop(columns=['2017'])


The data is then split into a training set and a test set. As the printed shapes show, the two sets have the expected proportions (roughly 80% training and 20% test).

In [58]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each part in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in split_parts))

(32, 118) (9, 118) (32,) (9,)


An instance of the linear regression model is created and fitted to the training data.

In [59]:
# Build an ordinary least-squares model and train it on the training split.
estimator = LinearRegression()
reg = estimator.fit(X_train, y_train)  # fit() hands back the fitted estimator

The model is now evaluated on the test data: the target values for the test set are predicted, and then the mean squared error and the R^2 score are calculated.

In [60]:
# Metric helpers used to evaluate the fitted model
from sklearn.metrics import mean_squared_error, r2_score

# Model predictions for the held-out test features
y_pred = reg.predict(X_test)

# Compare the predictions with the true test targets
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report both metrics, one per line
for label, value in (('Mean squared error:', mse), ('R^2 score:', r2)):
    print(label, value)


Mean squared error: 2452486252.694315
R^2 score: 0.9884000065100695


Another way of calculating the R^2 score is to pass the test data and target values as arguments to the fitted model's `score` method.

In [61]:
# Ask the fitted model itself for its R^2 score on the test split
r2 = reg.score(X=X_test, y=y_test)

# Display the score
print('R^2 score:', r2)


R^2 score: 0.9884000065100695


Both approaches yield the same R^2 score when used with the same test set and model; each one assesses how well the model predicts the given test data.
Because both approaches use the same inputs for the calculation, the R^2 score is identical as long as the same data (`X_test` and `y_test`) and the same model (`reg`) are used.

In [62]:
# Save the fitted regression model to disk so it can be reloaded later
# without retraining.
# `filename` holds the path that is actually written. Previously it named a
# different file ('SKLCropModel.pkl') and was never used.
filename = 'saved_models/sci_kit_crop_model.pkl'

# open() does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# The context manager closes the file handle even if pickling fails, which
# guarantees the data is flushed to disk.
# NOTE: only unpickle this file from a trusted location; loading a pickle
# can execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(reg, model_file)
