In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

import preprocessing as ps

# Suppress every Python warning for the rest of the session. This keeps cell
# output compact, but it also hides deprecation and library warnings, so
# comment it out when debugging unexpected results.
warnings.filterwarnings('ignore')
# Global seaborn look for all plots: white grid background, "Set2" colour palette.
sns.set_theme(style="whitegrid", palette="Set2")

In [2]:
# Load the dataset from a path relative to the notebook's working directory
# and preview the first five rows.
# NOTE(review): the file name suggests it was cleaned/encoded upstream - confirm
# where that preprocessing lives.
df=pd.read_csv('../datasets/clean_dataset.csv')
df.head()

Unnamed: 0,make,vehicle_class,engine_liters,cylinders,transmission_type,fuel_type,fuel_city_l_100km,fuel_hwy_l_100km,fuel_comb_l_100km,fuel_comb_mpg,co2_emission,co2_rating,smog_rating
0,0,7,3.5,6,8,3,12.6,9.4,11.2,25.0,263.0,4.0,5.0
1,0,7,2.0,4,8,3,11.0,8.6,9.9,29.0,232.0,5.0,6.0
2,0,7,2.0,4,8,3,11.3,9.1,10.3,27.0,242.0,5.0,6.0
3,0,0,2.0,4,8,3,11.2,8.0,9.8,29.0,230.0,5.0,7.0
4,0,0,2.0,4,8,3,11.3,8.1,9.8,29.0,231.0,5.0,7.0


<h3>Data Overview</h3>

In [4]:
# Target: the raw 'co2_emission' column.
y = df['co2_emission']
# Features: every remaining column of the frame (everything except 'co2_emission').
# NOTE(review): 'co2_rating' stays in the feature set; it looks like it may be
# derived from the target - confirm it does not leak the answer to the models.
x = df.drop(columns='co2_emission')

In [5]:
# Replace the raw target assigned in the previous cell with log(1 + co2_emission).
# Every model below is fitted and scored on this log scale, so apply np.expm1
# to a prediction to bring it back to the original emission units.
y=np.log1p(df['co2_emission'])

In [6]:
# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible across kernel restarts. (train_test_split is imported in the
# notebook's import cell together with the other scikit-learn names.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [7]:
# Fit one estimator and collect its train/test metrics.
def train_model(model, model_name, x_train, y_train, x_test, y_test, is_classification=True):
    """Fit ``model`` on the training split and score it on both splits.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place on ``x_train`` / ``y_train``.
    model_name : str
        Label stored under the ``'model'`` key of the result.
    x_train, y_train : training features and targets.
    x_test, y_test : held-out features and targets.
    is_classification : bool, default True
        Selects the metric family. Regression callers must pass ``False``
        explicitly, because the default computes classification metrics.

    Returns
    -------
    dict
        Always contains ``'model'``. For classification it adds
        ``'train_accuracy'``, ``'test_accuracy'`` and
        ``'classification_report'`` (dict form, test split only). For
        regression it adds ``'train_mse'``, ``'test_mse'``, ``'train_r2'``
        and ``'test_r2'``. Metrics are computed on whatever scale the
        targets are passed in.
    """
    model.fit(x_train, y_train)

    # Score on both splits so over-fitting shows up as a train/test gap.
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)

    if not is_classification:
        # Regression metrics
        return {
            'model': model_name,
            'train_mse': mean_squared_error(y_train, pred_train),
            'test_mse': mean_squared_error(y_test, pred_test),
            'train_r2': r2_score(y_train, pred_train),
            'test_r2': r2_score(y_test, pred_test),
        }

    # Classification metrics; the detailed report covers the test split only.
    return {
        'model': model_name,
        'train_accuracy': accuracy_score(y_train, pred_train),
        'test_accuracy': accuracy_score(y_test, pred_test),
        'classification_report': classification_report(y_test, pred_test, output_dict=True),
    }

# Candidate regressors to benchmark, keyed by the label shown in the results table.
model_list = {
    'linear_regression': LinearRegression(),
    'random_forest': RandomForestRegressor(n_estimators=50, random_state=42),
    'xgboost': xgb.XGBRegressor(max_depth=1),
}

# The target is continuous, so every candidate is scored with the regression metrics.
is_classification = False

# Fit and score each candidate on the same train/test split (one result dict per model).
results = [
    train_model(estimator, name, x_train, y_train, x_test, y_test, is_classification)
    for name, estimator in model_list.items()
]

# One row per model; the bare last expression renders the comparison table.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,model,train_mse,test_mse,train_r2,test_r2
0,linear_regression,0.000925,0.000994,0.984764,0.982618
1,random_forest,4.2e-05,0.000175,0.999311,0.996933
2,xgboost,0.00055,0.000708,0.990937,0.987611


In [8]:
# Refit a standalone random forest for the spot checks below. The
# hyper-parameters repeat the 'random_forest' entry of the comparison above.
model = RandomForestRegressor(random_state=42, n_estimators=50).fit(x_train, y_train)
model

In [9]:
# First row of the full feature table (by position), shown as a Series.
x.iloc[0]


make                  0.0
vehicle_class         7.0
engine_liters         3.5
cylinders             6.0
transmission_type     8.0
fuel_type             3.0
fuel_city_l_100km    12.6
fuel_hwy_l_100km      9.4
fuel_comb_l_100km    11.2
fuel_comb_mpg        25.0
co2_rating            4.0
smog_rating           5.0
Name: 0, dtype: float64

In [10]:
# Predict for the first row of the full feature table. Selecting with a list
# (iloc[[0]]) keeps a one-row DataFrame, so the model receives the same column
# names it was fitted with instead of an unnamed NumPy array.
# The model was fitted on the log1p-transformed target, so the value is on
# that log scale (np.expm1 converts it back).
# NOTE(review): x is the full dataset, so this row may have been part of the
# training split - treat it as a sanity check, not an out-of-sample test.
model.predict(x.iloc[[0]])


array([5.57465475])

In [11]:
# Actual (log1p-scale) target for the same row as the prediction above.
# Positional access matches the x.iloc[...] selection and does not rely on
# the index labels starting at 0.
y.iloc[0]


5.575949103146316

In [12]:
# Out-of-sample check: predict the first row (by position) of the held-out
# test split. iloc[[0]] keeps a one-row DataFrame so the feature names the
# model was fitted with are preserved. The result is on the log1p scale.
model.predict(x_test.iloc[[0]])


array([5.30943027])

In [13]:
# Actual (log1p-scale) target of the first held-out row, for comparison with
# the prediction in the previous cell.
y_test.iloc[0]


5.308267697401205