
# 🌱 Predicting Biofuel Production from Agricultural Waste

This notebook demonstrates a **simple AI pipeline** to estimate biofuel energy potential from crop production data.

Steps:
1. Download a dataset from **Kaggle** (Crop Production in India – 1997–2021).
2. Preprocess the dataset (clean columns, handle missing values).
3. Compute **Residue-to-Product Ratio (RPR)** × **Calorific Value (CV)** → Energy (GJ).
4. Train a **RandomForest Regressor** to predict biofuel potential.
5. Evaluate the model.

> ⚠️ Make sure you have a Kaggle API token (`kaggle.json`) placed in the correct folder before running.


In [1]:

# Install Kaggle API if not already installed
#!pip install -q kaggle

#import os, zipfile

# Dataset: India Crop Production (1997–2021)
#DATASET = "mohansachdeva/india-crop-production-1997-2021"

#os.makedirs("data", exist_ok=True)
#!kaggle datasets download -d {DATASET} -p data

# Unzip files
#for fn in os.listdir("data"):
   # if fn.endswith(".zip"):
        # with zipfile.ZipFile(os.path.join("data", fn), "r") as z:
        #     z.extractall("data")
        # os.remove(os.path.join("data", fn))

#os.listdir("data")


In [2]:

import pandas as pd
import numpy as np

# Load dataset (pick first CSV file)
#files = [f for f in os.listdir("data") if f.endswith(".csv")]
df = pd.read_csv(r"crop_production.csv")
df


Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0
...,...,...,...,...,...,...,...
246086,West Bengal,PURULIA,2014,Summer,Rice,306.0,801.0
246087,West Bengal,PURULIA,2014,Summer,Sesamum,627.0,463.0
246088,West Bengal,PURULIA,2014,Whole Year,Sugarcane,324.0,16250.0
246089,West Bengal,PURULIA,2014,Winter,Rice,279151.0,597899.0


In [3]:
df.head()

Unnamed: 0,State_Name,District_Name,Crop_Year,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,2000,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,2000,Whole Year,Cashewnut,720.0,165.0


In [4]:
median_value = df['Production'].median()
df['Production'].fillna(median_value, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Production'].fillna(median_value, inplace=True)


In [5]:
df.isna().sum()

State_Name       0
District_Name    0
Crop_Year        0
Season           0
Crop             0
Area             0
Production       0
dtype: int64

In [6]:

# Standardize column names
df.columns = [c.strip().replace(" ", "_") for c in df.columns]

rename_map = {
    "state_name": "State_Name",
    "district_name": "District_Name",
    "season": "Season",
    "crop": "Crop",
    "area": "Area",
    "production": "Production"
}
df = df.rename(columns={k:v for k,v in rename_map.items() if k in df.columns.str.lower().tolist()})

needed = ["State_Name","District_Name","Season","Crop","Area","Production"]
df = df[[c for c in df.columns if c in needed]].copy()

# Handle numeric
df["Area"] = pd.to_numeric(df["Area"], errors="coerce")
df["Production"] = pd.to_numeric(df["Production"], errors="coerce")
df = df.dropna(subset=["Crop","Area","Production"]).reset_index(drop=True)

print(df.shape)
df.head()


(246091, 6)


Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production
0,Andaman and Nicobar Islands,NICOBARS,Kharif,Arecanut,1254.0,2000.0
1,Andaman and Nicobar Islands,NICOBARS,Kharif,Other Kharif pulses,2.0,1.0
2,Andaman and Nicobar Islands,NICOBARS,Kharif,Rice,102.0,321.0
3,Andaman and Nicobar Islands,NICOBARS,Whole Year,Banana,176.0,641.0
4,Andaman and Nicobar Islands,NICOBARS,Whole Year,Cashewnut,720.0,165.0


In [7]:

# Residue-to-Product Ratio (RPR) and Calorific Value (MJ/kg)
RPR_CV = {
    "Sugarcane": {"rpr": 0.28, "cv": 18.5},
    "Rice": {"rpr": 0.22, "cv": 14.5},
    "Wheat": {"rpr": 1.40, "cv": 17.5},
    "Maize": {"rpr": 1.00, "cv": 18.0},
    "Cotton": {"rpr": 2.00, "cv": 16.5},
    "Soybean": {"rpr": 1.50, "cv": 17.2},
}

def compute_energy(row):
    crop = str(row["Crop"]).strip().title()
    crop = {"Paddy":"Rice","Soyabean":"Soybean"}.get(crop, crop)
    rpr = RPR_CV.get(crop, {}).get("rpr", 1.0)
    cv = RPR_CV.get(crop, {}).get("cv", 17.0)
    residue_tonnes = row["Production"] * rpr
    energy_gj = residue_tonnes * 1000 * cv / 1000  # tonnes→kg, MJ/kg → MJ, /1000 → GJ
    return residue_tonnes, energy_gj

df[["Residue_tonnes","Energy_GJ"]] = df.apply(lambda r: compute_energy(r), axis=1, result_type="expand")
df.head()


Unnamed: 0,State_Name,District_Name,Season,Crop,Area,Production,Residue_tonnes,Energy_GJ
0,Andaman and Nicobar Islands,NICOBARS,Kharif,Arecanut,1254.0,2000.0,2000.0,34000.0
1,Andaman and Nicobar Islands,NICOBARS,Kharif,Other Kharif pulses,2.0,1.0,1.0,17.0
2,Andaman and Nicobar Islands,NICOBARS,Kharif,Rice,102.0,321.0,70.62,1023.99
3,Andaman and Nicobar Islands,NICOBARS,Whole Year,Banana,176.0,641.0,641.0,10897.0
4,Andaman and Nicobar Islands,NICOBARS,Whole Year,Cashewnut,720.0,165.0,165.0,2805.0


In [8]:

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, r2_score

# # Features and target
# target = "Energy_GJ"
# y = df[target]

# cat_cols = ["State_Name","District_Name","Season","Crop"]
# num_cols = ["Area","Production"]
# X = df[cat_cols + num_cols]

# # Preprocess (encode categoricals)
# pre = ColumnTransformer([
#     ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
#     ("num", "passthrough", num_cols)
# ])

# # Model pipeline
# pipe = Pipeline([
#     ("prep", pre),
#     ("rf", RandomForestRegressor(n_estimators=50, random_state=42))
# ])
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# # Features and target
# target = "Energy_GJ"
# y = df[target]

# cat_cols = ["State_Name", "District_Name", "Season", "Crop"]
# num_cols = ["Area", "Production"]
# X = df[cat_cols + num_cols]

# -----------------------------
# Preprocess (with imputers)
# -----------------------------
# pre = ColumnTransformer([
#     ("cat", Pipeline([
#         ("imputer", SimpleImputer(strategy="most_frequent")),   # handle NaN in categoricals
#         ("encoder", OneHotEncoder(handle_unknown="ignore"))
#     ]), cat_cols),
#     ("num", Pipeline([
#         ("imputer", SimpleImputer(strategy="median"))           # handle NaN in numericals
#     ]), num_cols)
# ])

# # -----------------------------
# # Model pipeline
# # -----------------------------
# pipe = Pipeline([
#     ("prep", pre),
#     ("rf", RandomForestRegressor(n_estimators=50, random_state=42))
# ])





In [9]:
# Features and target
target = "Energy_GJ"
y = df[target]

cat_cols = ["State_Name", "District_Name", "Season", "Crop"]
num_cols = ["Area", "Production"]
X = df[cat_cols + num_cols]


In [10]:
# -----------------------------
# Preprocess (with imputers)
# -----------------------------
pre = ColumnTransformer([
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),   # handle NaN in categoricals
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]), cat_cols),
    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median"))           # handle NaN in numericals
    ]), num_cols)
])

In [11]:
# # -----------------------------
# # Model pipeline
# # -----------------------------
# pipe = Pipeline([
#     ("prep", pre),
#     ("rf", RandomForestRegressor(n_estimators=50, random_state=42))
# ])
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ("prep", pre),  # your preprocessing
    ("rf", RandomForestRegressor(
        n_estimators=50,  # reduce number of trees
        max_depth=10,     # limit depth of trees
        random_state=42
    ))
])

In [12]:
# Train/test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# -----------------------------
# Fit model  ✅ this was missing
# -----------------------------
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('prep', ...), ('rf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,n_estimators,50
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Predictions & evaluation
pred = pipe.predict(X_test)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print("MAE (GJ):", mae)
print("R²:", r2)

MAE (GJ): 91162.4286186079
R²: 0.9999233373728348


In [15]:

# import joblib, os, json

# os.makedirs("models", exist_ok=True)
# joblib.dump(pipe, "models/biofuel_model.joblib")

# meta = {
#     "categorical_features": cat_cols,
#     "numerical_features": num_cols,
#     "target": target
# }
# with open("models/metadata.json", "w") as f:
#     json.dump(meta, f, indent=2)

# print("✅ Model and metadata saved in 'models/' folder")


In [16]:
# import joblib
# import os
# import json

# # Specify the folder where you want to save your files
# SAVE_DIR = r"D:\skill4future\project sub\final project biofuel\saved_models"  # replace with your path
# os.makedirs(SAVE_DIR, exist_ok=True)  # create the folder if it doesn't exist

# # Save the model
# model_path = os.path.join(SAVE_DIR, "biofuel_model.joblib")
# joblib.dump(pipe, model_path)

# # Save metadata
# meta = {
#     "categorical_features": cat_cols,
#     "numerical_features": num_cols,
#     "target": target
# }
# metadata_path = os.path.join(SAVE_DIR, "metadata.json")
# with open(metadata_path, "w") as f:
#     json.dump(meta, f, indent=2)

# print(f"✅ Model and metadata saved in '{SAVE_DIR}'")


import joblib
import os
import json

# Use a relative folder in your project
SAVE_DIR = "saved_models"
os.makedirs(SAVE_DIR, exist_ok=True)  # create folder if it doesn't exist

# Save the trained model
model_path = os.path.join(SAVE_DIR, "biofuel_model.joblib")
joblib.dump(pipe, model_path)

# Save metadata
meta = {
    "categorical_features": cat_cols,
    "numerical_features": num_cols,
    "target": target
}
metadata_path = os.path.join(SAVE_DIR, "metadata.json")
with open(metadata_path, "w") as f:
    json.dump(meta, f, indent=2)

print(f"✅ Model and metadata saved in '{SAVE_DIR}'")



✅ Model and metadata saved in 'saved_models'
