In [None]:
# imports and constants
import os
import zipfile
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #handles missing data (using a strategy)
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import requests
from io import BytesIO

DATA_DIR = Path("data") #PATH object gets created
DATA_DIR.mkdir(exist_ok=True) #Creates the folder if it doesn’t exist.
VEH_ZIP_URL = "https://www.fueleconomy.gov/feg/epadata/vehicles.csv.zip"
EMISS_ZIP_URL = "https://www.fueleconomy.gov/feg/epadata/emissions.csv.zip"


In [None]:
# download & extraction
def download_and_extract(url, output_dir=DATA_DIR):
    local_zip = output_dir / Path(url).name
    if not local_zip.exists():
        print(f"Downloading {url} ...")
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        local_zip.write_bytes(r.content)
    else:
        print(f"{local_zip} already exists, skipping download.")
    with zipfile.ZipFile(local_zip, "r") as z:
        z.extractall(output_dir)
    print(f"Extracted {local_zip} to {output_dir}")

download_and_extract(VEH_ZIP_URL)
download_and_extract(EMISS_ZIP_URL)


data\vehicles.csv.zip already exists, skipping download.
Extracted data\vehicles.csv.zip to data
data\emissions.csv.zip already exists, skipping download.
Extracted data\emissions.csv.zip to data


In [None]:
# loading CSVs
vehicles_csv = DATA_DIR / "vehicles.csv"
emissions_csv = DATA_DIR / "emissions.csv"

df_veh = pd.read_csv(vehicles_csv, low_memory=False)
df_em = pd.read_csv(emissions_csv, low_memory=False)

print("vehicles:", df_veh.shape)
print("emissions:", df_em.shape)


vehicles: (49529, 84)
emissions: (53022, 8)


In [None]:
# checking columns and rows
display(df_veh.head())
display(df_em.head())
print("veh columns:", list(df_veh.columns)[:40])
print("em columns:", list(df_em.columns)[:40])


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,14.167143,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,27.046364,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,11.018889,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,27.046364,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,15.658421,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


Unnamed: 0,efid,id,salesArea,score,scoreAlt,smartwayScore,standard,stdText
0,4HNXV03.2MJE,19332,3,1.0,-1.0,-1,B9,BIN 9
1,4HNXV03.2MJE,19332,7,1.0,-1.0,-1,L1,LEV
2,4HNXV03.2MJE,19333,3,1.0,-1.0,-1,B9,BIN 9
3,4HNXV03.2MJE,19333,7,1.0,-1.0,-1,L1,LEV
4,4ADXV01.8346,19334,3,1.0,-1.0,-1,B9,BIN 9


veh columns: ['barrels08', 'barrelsA08', 'charge120', 'charge240', 'city08', 'city08U', 'cityA08', 'cityA08U', 'cityCD', 'cityE', 'cityUF', 'co2', 'co2A', 'co2TailpipeAGpm', 'co2TailpipeGpm', 'comb08', 'comb08U', 'combA08', 'combA08U', 'combE', 'combinedCD', 'combinedUF', 'cylinders', 'displ', 'drive', 'engId', 'eng_dscr', 'feScore', 'fuelCost08', 'fuelCostA08', 'fuelType', 'fuelType1', 'ghgScore', 'ghgScoreA', 'highway08', 'highway08U', 'highwayA08', 'highwayA08U', 'highwayCD', 'highwayE']
em columns: ['efid', 'id', 'salesArea', 'score', 'scoreAlt', 'smartwayScore', 'standard', 'stdText']


In [37]:
# merging datasets

common = set(df_veh.columns).intersection(df_em.columns)
print("Common columns:", common)
# id is common one
df = df_em.merge(df_veh, on="id", how="left", suffixes=("_em", "_veh"))

print("Merged shape:", df.shape)

display(df[['id', 'co2', 'co2TailpipeGpm']].head())


Common columns: {'id'}
Merged shape: (53022, 91)


Unnamed: 0,id,co2,co2TailpipeGpm
0,19332,-1.0,493.722222
1,19332,-1.0,493.722222
2,19333,-1.0,493.722222
3,19333,-1.0,493.722222
4,19334,-1.0,423.190476


In [None]:
# defines what model is predicting joh target variable hai
if 'co2' in df.columns:
    target_col = 'co2'
elif 'co2TailpipeGpm' in df.columns:
    target_col = 'co2TailpipeGpm'  # fallback
else:
    raise KeyError("Couldn't find a CO2 target column. Inspect the merged dataframe columns.")

# numerical candidates
num_feats = ['year', 'displ', 'cylinders', 'city08', 'highway08', 'comb08', 'barrels08']
num_feats = [c for c in num_feats if c in df.columns]

# categorical candidates (small cardinality preferable)
cat_feats = ['make', 'model', 'trany', 'VClass', 'fuelType']
cat_feats = [c for c in cat_feats if c in df.columns]

print("Target:", target_col)
print("Numerical:", num_feats)
print("Categorical:", cat_feats)


Target: co2
Numerical: ['year', 'displ', 'cylinders', 'city08', 'highway08', 'comb08', 'barrels08']
Categorical: ['make', 'model', 'trany', 'VClass', 'fuelType']


In [None]:
selected = num_feats + cat_feats + [target_col] #combines all this in a single list
data = df[selected].copy() # creating a deep copy
print("Before dropna:", data.shape)
# Drop rows where target is missing
data = data[~data[target_col].isna()].reset_index(drop=True)
print("After removing missing target:", data.shape)
# drop rows with missing numerically features
data = data.dropna(subset=num_feats, how="all").reset_index(drop=True)
print("Final dataset shape:", data.shape)
display(data.head())


Before dropna: (53022, 13)
After removing missing target: (52935, 13)
Final dataset shape: (52935, 13)


Unnamed: 0,year,displ,cylinders,city08,highway08,comb08,barrels08,make,model,trany,VClass,fuelType,co2
0,2004.0,3.0,6.0,16.0,22.0,18.0,16.528333,Acura,NSX,Automatic (S4),Two Seaters,Premium,-1.0
1,2004.0,3.0,6.0,16.0,22.0,18.0,16.528333,Acura,NSX,Automatic (S4),Two Seaters,Premium,-1.0
2,2004.0,3.2,6.0,16.0,22.0,18.0,16.528333,Acura,NSX,Manual 6-spd,Two Seaters,Premium,-1.0
3,2004.0,3.2,6.0,16.0,22.0,18.0,16.528333,Acura,NSX,Manual 6-spd,Two Seaters,Premium,-1.0
4,2004.0,1.8,4.0,18.0,26.0,21.0,14.167143,Audi,TT Roadster,Automatic (S6),Two Seaters,Premium,-1.0


year            0
displ        2227
cylinders    2229
city08          0
highway08       0
comb08          0
barrels08       0
make            0
model           0
trany           0
VClass          0
fuelType        0
co2             0
dtype: int64


In [43]:
print(data.isnull().sum())

year            0
displ        2227
cylinders    2229
city08          0
highway08       0
comb08          0
barrels08       0
make            0
model           0
trany           0
VClass          0
fuelType        0
co2             0
dtype: int64


In [None]:
# training and testing part
X = data.drop(columns=[target_col])
y = data[target_col].astype(float)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, X_test.shape)


(42348, 12) (10587, 12)


In [None]:
# building preprocessing pipeline
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer( #applies different transformations to different columns
    transformers=[
        ('num', num_transformer, num_feats),
        ('cat', cat_transformer, cat_feats)
    ], remainder='drop'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])


In [None]:
# training model
print("Training model (this may take a minute)...")
model.fit(X_train, y_train)
print("Training complete.")


Training model (this may take a minute)...
Training complete.


In [None]:
# evaluating
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Test RMSE: {rmse:.2f} g/mile")
print(f"Test MAE: {mae:.2f} g/mile")
print(f"Test R2: {r2:.3f}")


Test RMSE: 3.22 g/mile
Test MAE: 0.56 g/mile
Test R2: 1.000


In [40]:
# Cell 13: save the trained model
OUT_DIR = Path("model")
OUT_DIR.mkdir(exist_ok=True)
joblib.dump(model, OUT_DIR / r"C:\Users\mahes\Desktop\tcb-project\notebook\model\co2_rf_model.joblib")

['C:\\Users\\mahes\\Desktop\\tcb-project\\notebook\\model\\co2_rf_model.joblib']

In [39]:
ex = X_test.sample(5, random_state=1)
preds = model.predict(ex)
result = ex.copy()
result['pred_co2'] = preds
display(result)


Unnamed: 0,year,displ,cylinders,city08,highway08,comb08,barrels08,make,model,trany,VClass,fuelType,pred_co2
49129,2024.0,2.4,4.0,20.0,26.0,22.0,13.523182,Subaru,Ascent,Automatic (AV-S8),Standard Sport Utility Vehicle 4WD,Regular,398.06
36616,2019.0,3.0,6.0,21.0,28.0,24.0,12.39625,Mercedes-Benz,AMG E53 4matic Plus,Automatic 9-spd,Midsize Cars,Premium,370.21
36119,2019.0,2.0,4.0,22.0,32.0,26.0,11.442692,Buick,Regal,Automatic (S9),Midsize Cars,Premium,343.78
47303,2023.0,5.7,8.0,15.0,20.0,17.0,17.500588,Ram,1500 Classic 4WD,Automatic 8-spd,Standard Pickup Trucks 4WD,Midgrade,526.85
11808,2009.0,3.0,6.0,18.0,28.0,22.0,13.523182,BMW,328i,Automatic (S6),Compact Cars,Premium,-1.0
