In [2]:
import pickle

with open('../data/processed/train-processed.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('../data/processed/test-processed.pkl', 'rb') as f:
    test = pickle.load(f)
    
train.columns.to_numpy()

array(['Make', 'Vehicle Class', 'Transmission', 'Id', 'Engine Size(L)',
       'Cylinders', 'Fuel Type', 'CO2 Emissions(g/km)',
       'Fuel Consumption City (km/l)', 'Fuel Consumption Hwy (km/l)',
       'Fuel Consumption Comb (km/l)'], dtype=object)

In [3]:
total_data = len(train) + len(test)
train_test_ratio = (len(test) / total_data) * 100

print(f'Total number of data points: {total_data}')
print(f'Number of data points in training set: {len(train)}')
print(f'Number of data points in test set: {len(test)}')
print(f'Ratio of train to test data: {train_test_ratio:.2f}%')

Total number of data points: 195917
Number of data points in training set: 137141
Number of data points in test set: 58776
Ratio of train to test data: 30.00%


In [4]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (137141, 11)
Test shape: (58776, 10)


In [5]:
# Bagi data menjadi fitur dan target
X_train = train.drop('CO2 Emissions(g/km)', axis=1)
y_train = train['CO2 Emissions(g/km)']

X_test = test.copy()

In [11]:
from xgboost import XGBRegressor

params_xgb = {
    'objective': 'reg:squarederror',  # Specify the objective for regression tasks
    'colsample_bytree': 0.8,          # Fraction of features to be randomly sampled for each tree
    'learning_rate': 0.1,             # Step size shrinkage to prevent overfitting
    'max_depth': 16,                   # Maximum depth of a tree
    'alpha': 10,                      # L1 regularization term on weights
    'n_estimators': 100,              # Number of boosting rounds
    'subsample': 0.8,                 # Fraction of samples used for each boosting round
    'random_state': 42
}

model_xgb = XGBRegressor(**params_xgb)

model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)


In [7]:
test

Unnamed: 0,Make,Vehicle Class,Transmission,Id,Engine Size(L),Cylinders,Fuel Type,Fuel Consumption City (km/l),Fuel Consumption Hwy (km/l),Fuel Consumption Comb (km/l)
0,3,3,16,137142,3.600000,5.09832,4,6.400000,177.000000,10.645606
1,7,16,14,137143,3.700000,6.00000,4,621.563732,1438.000000,5.530000
2,16,12,20,137144,2.949464,5.09832,4,4.081382,9.717465,5.522621
3,14,3,16,137145,4.600000,5.09832,5,16.800000,6.290000,1642.000000
4,2,13,16,137146,4.400000,8.00000,5,2.402064,5.221589,556.380979
...,...,...,...,...,...,...,...,...,...,...
58771,11,12,4,195913,3.300000,5.09832,4,5.448152,20.900000,19.500000
58772,7,23,14,195914,1.600000,4.00000,4,16.155472,15.152132,15.690000
58773,4,13,4,195915,3.500000,5.09832,4,6.900000,9.089579,7.737621
58774,7,22,14,195916,5.000000,8.00000,4,11.874272,21.767373,14.924893


In [8]:
submission_3 = test.loc[:, ["Id"]]

submission_3['CO2 Emissions(g/km)'] = y_pred

In [9]:
submission_3

Unnamed: 0,Id,CO2 Emissions(g/km)
0,137142,226.989914
1,137143,319.164520
2,137144,225.152481
3,137145,267.982544
4,137146,318.191223
...,...,...
58771,195913,251.984879
58772,195914,212.531967
58773,195915,282.414642
58774,195916,294.941742


In [10]:
import os

if not os.path.exists('../data/submission'):
    os.makedirs('../data/submission')

submission_3.to_csv('../data/submission/submission_3.csv', index=False)