In [47]:
import pickle

with open('../data/processed/train-processed-v2.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('../data/processed/test-processed-v2.pkl', 'rb') as f:
    test = pickle.load(f)
    
train.columns.to_numpy()

array(['Make', 'Vehicle Class', 'Transmission', 'Id',
       'CO2 Emissions(g/km)', 'Engine Size(L)', 'Cylinders', 'Fuel Type',
       'Fuel Consumption City (l/100km)',
       'Fuel Consumption Hwy (l/100km)',
       'Fuel Consumption Comb (l/100km)'], dtype=object)

In [48]:
total_data = len(train) + len(test)
train_test_ratio = (len(test) / total_data) * 100

print(f'Total number of data points: {total_data}')
print(f'Number of data points in training set: {len(train)}')
print(f'Number of data points in test set: {len(test)}')
print(f'Ratio of train to test data: {train_test_ratio:.2f}%')

Total number of data points: 195917
Number of data points in training set: 137141
Number of data points in test set: 58776
Ratio of train to test data: 30.00%


In [49]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (137141, 11)
Test shape: (58776, 10)


In [50]:
train.head()

Unnamed: 0,Make,Vehicle Class,Transmission,Id,CO2 Emissions(g/km),Engine Size(L),Cylinders,Fuel Type,Fuel Consumption City (l/100km),Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km)
0,7,6,3,1,306.0,3.5,6.0,3,0.115741,0.162866,7.514789
1,4,6,3,2,283.0,5.3,5.157514,3,0.03667,0.03251,0.034674
2,2,10,23,3,329.0,4.4,5.157514,4,10.173097,0.492611,7.811451
3,11,11,13,4,270.0,2.989195,4.0,3,15.337423,6.289308,7.811451
4,1,3,13,5,193.0,3.0,6.0,4,0.049628,16.666667,7.246377


In [56]:
# Bagi data menjadi fitur dan target
X_train = train.drop(['CO2 Emissions(g/km)'], axis=1)
y_train = train['CO2 Emissions(g/km)']

X_test = test.copy()

In [57]:
from xgboost import XGBRegressor

params_xgb = {
    'objective': 'reg:squarederror',  # Specify the objective for regression tasks
    'colsample_bytree': 0.8,          # Fraction of features to be randomly sampled for each tree
    'learning_rate': 0.08,             # Step size shrinkage to prevent overfitting
    'max_depth': 128,                  # Maximum depth of a tree
    'alpha': 10,                      # L1 regularization term on weights
    'n_estimators': 1000,             # Number of boosting rounds
    'random_state': 472389
}

model_xgb = XGBRegressor(**params_xgb)

model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)


In [None]:
test

Unnamed: 0,Make,Vehicle Class,Transmission,Id,Engine Size(L),Cylinders,Fuel Type,Fuel Consumption City (l/100km),Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km)
0,3,3,16,137142,3.6,5.09832,4,15.625000,0.564972,9.393570
1,7,16,14,137143,3.7,6.00000,4,0.047371,0.069541,18.083183
2,16,12,20,137144,2.0,5.09832,4,24.501563,10.290744,18.107390
3,14,3,16,137145,4.6,5.09832,5,5.952381,15.898251,0.060901
4,2,13,16,137146,4.4,8.00000,5,19.230769,19.151250,0.031726
...,...,...,...,...,...,...,...,...,...,...
58771,11,12,4,195913,3.3,5.09832,4,18.354837,4.784689,5.128205
58772,7,23,14,195914,1.6,4.00000,4,6.189868,6.599747,6.373486
58773,4,13,4,195915,3.5,5.09832,4,14.492754,11.001637,12.923901
58774,7,22,14,195916,5.0,8.00000,4,8.421590,4.594043,6.700212


In [58]:
submission_3 = test.loc[:, ["Id"]]

submission_3['CO2 Emissions(g/km)'] = y_pred

In [59]:
submission_3

Unnamed: 0,Id,CO2 Emissions(g/km)
0,137142,243.365555
1,137143,367.288177
2,137144,234.382019
3,137145,311.245270
4,137146,307.952454
...,...,...
58771,195913,285.188202
58772,195914,231.892853
58773,195915,288.778076
58774,195916,287.528717


In [60]:
import os

if not os.path.exists('../data/submission'):
    os.makedirs('../data/submission')

submission_3.to_csv('../data/submission/submission_4.1.csv', index=False)