<a href="https://colab.research.google.com/github/gwegayhu/dashboards-app/blob/master/ML_Model_for_Geospatial_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib

!pip install gdown
import pandas as pd
import gdown

# Download the file from Google Drive
file_id = '1NY8yDY4Zqvco83VO0_qAdUIgxHI8kQOm' # Extract the file ID from the URL
file_path = 'downloaded_file.csv'  # Local file name to save the downloaded data
gdown.download(id=file_id, output=file_path, quiet=False)

# Step 1: Load the data
# Read the downloaded CSV into a DataFrame and preview it.
data = pd.read_csv(file_path)

# A bare expression is rendered only when it is the last statement of a cell,
# so each preview is issued as its own explicit call.
data.info()           # prints column names, non-null counts and dtypes
display(data.head())  # rich-renders the first five rows

# Step 2: ETL (Extract, Transform, Load)
# Column groups consumed by the preprocessing step
numerical_features = [
    'Farm_Area(acres)',
    'Fertilizer_Used(tons)',
    'Pesticide_Used(kg)',
    'Water_Usage(cubic meters)',
]
categorical_features = ['Crop_Type', 'Irrigation_Type', 'Soil_Type', 'Season']

# The target is the yield column; the feature frame is every column except
# the target and `Farm_ID`.
y = data['Yield(tons)']
X = data.drop(['Farm_ID', 'Yield(tons)'], axis=1)

# Step 3: Feature Engineering
# Categorical columns are one-hot encoded; handle_unknown='ignore' makes a
# category that was not seen during fit encode as all zeros instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# Numerical columns are standardized.
numerical_transformer = StandardScaler()

# Route each column group to its transformer (numeric block first, then categorical).
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Step 4: Model Training Pipeline
# Random forest regressor with a fixed seed
model = RandomForestRegressor(random_state=42)

# Chain preprocessing and the regressor into a single estimator
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

# Step 5: Split the data
# Hold out 20% of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 6: Train the model
# Fits the preprocessing and the regressor on the training rows only.
pipeline.fit(X_train, y_train)

# Step 7: Evaluate the model
# Predict on the held-out rows and score them with mean squared error.
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

# Step 8: Deploy the model
# Persist the fitted pipeline (preprocessing + regressor) as a single file.
# The relative path resolves against the current working directory.
model_path = 'agriculture_yield_model.pkl'
joblib.dump(value=pipeline, filename=model_path)
print(f"Model saved to {model_path}")



Downloading...
From: https://drive.google.com/uc?id=1NY8yDY4Zqvco83VO0_qAdUIgxHI8kQOm
To: /content/downloaded_file.csv
100%|██████████| 3.23k/3.23k [00:00<00:00, 9.24MB/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Farm_ID                    50 non-null     object 
 1   Crop_Type                  50 non-null     object 
 2   Farm_Area(acres)           50 non-null     float64
 3   Irrigation_Type            50 non-null     object 
 4   Fertilizer_Used(tons)      50 non-null     float64
 5   Pesticide_Used(kg)         50 non-null     float64
 6   Yield(tons)                50 non-null     float64
 7   Soil_Type                  50 non-null     object 
 8   Season                     50 non-null     object 
 9   Water_Usage(cubic meters)  50 non-null     float64
dtypes: float64(5), object(5)
memory usage: 4.0+ KB
Mean Squared Error: 153.8028708549999
Model saved to agriculture_yield_model.pkl
