In [21]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Step 2: Load the data
file_path = 'sample_agriculture_data.csv'  # Path to the CSV file
data = pd.read_csv(file_path)

# Display the first few rows
print(data.head())

   Year       Region  Temperature  Precipitation  Soil_pH  Irrigation  \
0  2005      Central         20.6            710      6.3         118   
1  2005  Rift Valley         20.6            690      6.3         119   
2  2005      Eastern         20.3            650      6.3         135   
3  2005      Western         20.3            650      6.3         127   
4  2005       Nyanza         20.3            670      6.3         148   

   Crop_Type  Historical_Yield  
0          1               3.1  
1          1               3.1  
2          1               3.1  
3          1               3.1  
4          1               3.1  


In [26]:
label_encoder = LabelEncoder()
data['Region'] = label_encoder.fit_transform(data['Region'])

# Check the transformed data
print(data[['Region']].head())

   Region
0       0
1       4
2       1
3       5
4       3


In [27]:
X = data[['Year', 'Region', 'Temperature', 'Precipitation', 'Soil_pH', 'Irrigation', 'Crop_Type']]
y = data['Historical_Yield']

# Display the feature set and target
print(X.head())
print(y.head())

   Year  Region  Temperature  Precipitation  Soil_pH  Irrigation  Crop_Type
0  2005       0         20.6            710      6.3         118          1
1  2005       4         20.6            690      6.3         119          1
2  2005       1         20.3            650      6.3         135          1
3  2005       5         20.3            650      6.3         127          1
4  2005       3         20.3            670      6.3         148          1
0    3.1
1    3.1
2    3.1
3    3.1
4    3.1
Name: Historical_Yield, dtype: float64


In [28]:
# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the sizes of the splits
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (91, 7)
Testing set size: (23, 7)


In [29]:
# Step 6: Train the model (using Linear Regression)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Confirm model training
print("Model trained successfully!")


Model trained successfully!


In [10]:
# Step 7: Make predictions
y_pred = model.predict(X_test)

# Output the first few predictions
print("Predicted Values (y_pred):\n", y_pred[:5], "\n")

Predicted Values (y_pred):
 [3.24464746 3.31384847 3.46810816 3.71491724] 



In [31]:
# Step 8: Evaluate the model
y_pred = model.predict(X_test)

# Calculate and display evaluation metrics
print("R² Score:", r2_score(y_test, y_pred))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))


R² Score: 0.7828827380952349
Mean Squared Error: 0.01241139130434801


In [35]:
# Step 9: Prepare the input data for 2024 and 2025
future_years = pd.DataFrame({
    'Year': [2024, 2024, 2024, 2024, 2024, 2024, 2025, 2025, 2025, 2025, 2025, 2025],
    'Region': list(range(6)) * 2, 
    'Temperature': [22.5] * 12, 
    'Precipitation': [700] * 12, 
    'Soil_pH': [6.5] * 12,  
    'Irrigation': [150] * 12, 
    'Crop_Type': [1] * 12 
})

# Predict yields
future_predictions = model.predict(future_years)
future_years['Predicted_Yield'] = future_predictions

print(future_years)


    Year  Region  Temperature  Precipitation  Soil_pH  Irrigation  Crop_Type  \
0   2024       0         22.5            700      6.5         150          1   
1   2024       1         22.5            700      6.5         150          1   
2   2024       2         22.5            700      6.5         150          1   
3   2024       3         22.5            700      6.5         150          1   
4   2024       4         22.5            700      6.5         150          1   
5   2024       5         22.5            700      6.5         150          1   
6   2025       0         22.5            700      6.5         150          1   
7   2025       1         22.5            700      6.5         150          1   
8   2025       2         22.5            700      6.5         150          1   
9   2025       3         22.5            700      6.5         150          1   
10  2025       4         22.5            700      6.5         150          1   
11  2025       5         22.5           

In [36]:
# Decode the Region back to its original names
future_years['Region'] = label_encoder.inverse_transform(future_years['Region'])

# Display the final DataFrame
print(future_years)


    Year  Region  Temperature  Precipitation  Soil_pH  Irrigation  Crop_Type  \
0   2024       0         22.5            700      6.5         150          1   
1   2024       1         22.5            700      6.5         150          1   
2   2024       2         22.5            700      6.5         150          1   
3   2024       3         22.5            700      6.5         150          1   
4   2024       4         22.5            700      6.5         150          1   
5   2024       5         22.5            700      6.5         150          1   
6   2025       0         22.5            700      6.5         150          1   
7   2025       1         22.5            700      6.5         150          1   
8   2025       2         22.5            700      6.5         150          1   
9   2025       3         22.5            700      6.5         150          1   
10  2025       4         22.5            700      6.5         150          1   
11  2025       5         22.5           