In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from the local CSV file and preview its first rows.
df = pd.read_csv('co2.csv')
df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


# Preprocessing - Required Steps

In [3]:
# Remove the 'Model' column from the working frame.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the column has already been removed.
df = df.drop(columns=['Model'], errors='ignore')

In [4]:
# Total number of missing values in the whole frame
# (per-column null counts, summed again across columns).
df.isnull().sum().sum()

0

In [5]:
# Summary of the object-dtype columns only, transposed so that each
# column of the dataset becomes one row of the summary table.
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
Make,7385,42,FORD,628
Vehicle Class,7385,16,SUV - SMALL,1217
Transmission,7385,27,AS6,1324
Fuel Type,7385,5,X,3637


In [6]:
# Separate the regression target from the predictor columns.
target_column = 'CO2 Emissions(g/km)'
y = df[target_column]
X = df.drop(columns=[target_column])

# Sanity check: both objects must have the same number of rows.
print(X.shape, y.shape, sep='\n')

(7385, 10)
(7385,)


In [8]:
# One-hot encode every object-dtype column. Dummy columns are named
# '<original column>*<level>', and drop_first=True omits the first level
# of each encoded column.
categorical_features = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(
    data=X,
    columns=categorical_features,
    prefix=categorical_features,
    prefix_sep='*',
    drop_first=True,
)
X_encoded.shape

(7385, 92)

In [13]:
# Peek at three random rows of the encoded frame. The fixed seed keeps the
# displayed sample identical on every run (Restart & Run All reproducibility).
X_encoded.sample(3, random_state=42)

Unnamed: 0,Engine Size(L),Cylinders,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),Make*ALFA ROMEO,Make*ASTON MARTIN,Make*AUDI,Make*BENTLEY,...,Transmission*AV6,Transmission*AV7,Transmission*AV8,Transmission*M5,Transmission*M6,Transmission*M7,Fuel Type*E,Fuel Type*N,Fuel Type*X,Fuel Type*Z
6286,3.5,6,12.1,8.9,10.7,26,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1844,3.5,6,11.9,8.2,10.2,28,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6434,1.8,4,7.8,5.9,6.9,41,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Train-Test Split

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.preprocessing import StandardScaler

In [12]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes of the four resulting pieces, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(5908, 92)
(5908,)
(1477, 92)
(1477,)


# Scaling

In [16]:
# Scaler instance with default settings; it is fitted on the training features later.
scaler = StandardScaler()

In [17]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both the training and the test features.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model Building

## LinearRegression

In [61]:
from sklearn.linear_model import LinearRegression, LassoCV, Lasso, RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [70]:
# Baseline model: linear regression with default settings.
linear_model = LinearRegression()

In [71]:
# Fit the baseline model on the scaled training features.
linear_model.fit(X_train_scaled, y_train)

In [72]:
# Predictions on the data the model was fitted on ...
y_train_pred = linear_model.predict(X_train_scaled)
# ... and on the held-out test split.
y_pred = linear_model.predict(X_test_scaled)

In [35]:
def train_val(y_train, y_train_pred, y_test, y_pred, model_name):
    """Build a side-by-side table of regression metrics for train and test data.

    Parameters
    ----------
    y_train, y_train_pred
        True and predicted target values for the training split.
    y_test, y_pred
        True and predicted target values for the test split.
    model_name : str
        Label used as the prefix of the two column names.

    Returns
    -------
    pandas.DataFrame
        One column named ``f'{model_name}_train'`` and one named
        ``f'{model_name}_test'``; the rows are ``R2``, ``mae``, ``mse``
        and ``rmse``.
    """

    def _split_metrics(actual, predicted):
        # Metrics for a single split, keyed by the row labels of the table.
        return {
            "R2": r2_score(actual, predicted),
            "mae": mean_absolute_error(actual, predicted),
            "mse": mean_squared_error(actual, predicted),
            "rmse": np.sqrt(mean_squared_error(actual, predicted)),
        }

    train_label = f'{model_name}_train'
    test_label = f'{model_name}_test'

    return pd.DataFrame({
        train_label: _split_metrics(y_train, y_train_pred),
        test_label: _split_metrics(y_test, y_pred),
    })

In [73]:
# Train/test metric table for the linear baseline.
lm_scores = train_val(
    y_train,
    y_train_pred,
    y_test,
    y_pred,
    model_name='Linear model',
)
lm_scores

Unnamed: 0,Linear model_train,Linear model_test
R2,0.993863,0.990794
mae,2.83042,3.104245
mse,20.980738,31.664477
rmse,4.580474,5.62712


In [74]:
# Metrics collected for every cross-validation fold.
scoring = ['r2', 'neg_root_mean_squared_error']

# 10-fold cross-validation of the linear model on the scaled training data;
# the returned dict of per-fold results is shown as a table.
pd.DataFrame(
    cross_validate(
        estimator=linear_model,
        X=X_train_scaled,
        y=y_train,
        scoring=scoring,
        cv=10,
    )
)

Unnamed: 0,fit_time,score_time,test_r2,test_neg_root_mean_squared_error
0,0.015965,0.001504,0.993171,-4.84095
1,0.019371,0.001993,0.994606,-4.026285
2,0.015518,0.001505,0.991667,-5.531995
3,0.016994,0.000999,0.993132,-4.580688
4,0.013785,0.001371,0.993541,-4.751072
5,0.017122,0.000998,0.993164,-4.708233
6,0.013203,0.001,0.995072,-4.212486
7,0.015193,0.000505,0.994743,-4.301809
8,0.016525,0.000999,0.992693,-4.860506
9,0.015505,0.001503,0.993956,-4.846732


## LassoCV Model

In [46]:
# Candidate regularization strengths: 100 values spaced evenly on a log
# scale from 10**-3 to 10**2.
alpha_spaces = np.logspace(start=-3, stop=2, num=100)

In [52]:
# Lasso whose alpha is picked from alpha_spaces via 10-fold cross-validation.
lasso_cv_model = LassoCV(
    alphas=alpha_spaces,
    cv=10,
    max_iter=10_000,
    random_state=42,
)

In [59]:
# Fit the cross-validated Lasso on the scaled training data.
lasso_cv_model.fit(X_train_scaled, y_train)

# Predict on both splits (these names are reused for every model).
y_train_pred = lasso_cv_model.predict(X_train_scaled)
y_pred = lasso_cv_model.predict(X_test_scaled)

lasso_scores = train_val(
    y_train,
    y_train_pred,
    y_test,
    y_pred,
    model_name='Lasso model',
)
lasso_scores

Unnamed: 0,Lasso model_train,Lasso model_test
R2,0.993857,0.990797
mae,2.828324,3.099226
mse,21.000114,31.655623
rmse,4.582588,5.626333


## RidgeCV

In [63]:
# Ridge regression whose alpha is chosen from alpha_spaces using 10-fold cross-validation.
ridge_cv_model = RidgeCV(alphas= alpha_spaces, cv=10)

In [64]:
# Fit the cross-validated Ridge on the scaled training data.
ridge_cv_model.fit(X_train_scaled, y_train)

# Predict on both splits (these names are reused for every model).
y_train_pred = ridge_cv_model.predict(X_train_scaled)
y_pred = ridge_cv_model.predict(X_test_scaled)

ridge_scores = train_val(
    y_train,
    y_train_pred,
    y_test,
    y_pred,
    model_name='ridge model',
)
ridge_scores

Unnamed: 0,ridge model_train,ridge model_test
R2,0.993862,0.990805
mae,2.832782,3.10587
mse,20.981672,31.628916
rmse,4.580575,5.623959


In [87]:
# Number of columns in the encoded feature frame.
X_encoded.shape[1]

92

In [88]:
# Number of fitted Ridge coefficients; should equal the encoded column count.
ridge_cv_model.coef_.shape[0]

92

In [84]:
# Use the magnitude of each Ridge coefficient as its importance score.
coefficients = np.abs(ridge_cv_model.coef_)

# Build the feature-importance table and order it from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_encoded.columns, 'Importance': coefficients})
    .sort_values(by='Importance', ascending=False)
)

# Show the ten highest-ranked features.
feature_importance_df.head(10)

Unnamed: 0,Feature,Importance
88,Fuel Type*E,29.869361
2,Fuel Consumption City (L/100 km),22.743763
4,Fuel Consumption Comb (L/100 km),21.048629
90,Fuel Type*X,15.217515
91,Fuel Type*Z,14.496396
3,Fuel Consumption Hwy (L/100 km),13.331719
5,Fuel Consumption Comb (mpg),6.67558
1,Cylinders,1.589904
18,Make*FORD,1.552733
14,Make*CHEVROLET,1.156187


# Feature Selection
1. Fuel Type
2. Fuel Consumption (City)