In [102]:
import pickle

with open('../data/processed/train-processed-v2.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('../data/processed/test-processed-v2.pkl', 'rb') as f:
    test = pickle.load(f)
    
train.columns.to_numpy()

array(['Fuel Consumption City (l/100km)',
       'Fuel Consumption Hwy (l/100km)',
       'Fuel Consumption Comb (l/100km)', 'Engine Size(L)', 'Cylinders',
       'Id', 'Make', 'Vehicle Class', 'Transmission', 'Fuel Type',
       'CO2 Emissions(g/km)'], dtype=object)

In [103]:
train.dtypes

Fuel Consumption City (l/100km)    float64
Fuel Consumption Hwy (l/100km)     float64
Fuel Consumption Comb (l/100km)    float64
Engine Size(L)                     float64
Cylinders                          float64
Id                                   int64
Make                                 int64
Vehicle Class                      float64
Transmission                       float64
Fuel Type                          float64
CO2 Emissions(g/km)                float64
dtype: object

In [104]:
total_data = len(train) + len(test)
train_test_ratio = (len(test) / total_data) * 100

print(f'Total number of data points: {total_data}')
print(f'Number of data points in training set: {len(train)}')
print(f'Number of data points in test set: {len(test)}')
print(f'Ratio of train to test data: {train_test_ratio:.2f}%')

Total number of data points: 195917
Number of data points in training set: 137141
Number of data points in test set: 58776
Ratio of train to test data: 30.00%


In [105]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (137141, 11)
Test shape: (58776, 10)


In [106]:
train.head()

Unnamed: 0,Fuel Consumption City (l/100km),Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km),Engine Size(L),Cylinders,Id,Make,Vehicle Class,Transmission,Fuel Type,CO2 Emissions(g/km)
0,-0.906532,-0.930067,0.025409,0.616836,0.584975,1,7,6.0,3.0,3.0,306.0
1,-0.915851,-0.949508,-1.023942,2.344387,1.426059,2,4,6.0,3.0,3.0,283.0
2,-0.247084,-0.880887,0.067026,1.480612,1.426059,3,2,10.0,23.0,4.0,329.0
3,0.887375,-0.016345,0.067026,-0.32372,-0.816831,4,11,11.0,13.0,3.0,270.0
4,-0.914324,1.531376,-0.012245,0.136961,0.584975,5,1,3.0,16.4,4.0,193.0


In [107]:
# Bagi data menjadi fitur dan target
X_train = train.drop(['CO2 Emissions(g/km)'], axis=1)
y_train = train['CO2 Emissions(g/km)']

X_test = test.copy()

In [108]:
from xgboost import XGBRegressor

params_xgb = {
    'objective': 'reg:squarederror',  # Specify the objective for regression tasks
    'colsample_bytree': 0.8,          # Fraction of features to be randomly sampled for each tree
    'learning_rate': 0.08,             # Step size shrinkage to prevent overfitting
    'max_depth': 128,                  # Maximum depth of a tree
    'alpha': 10,                      # L1 regularization term on weights
    'n_estimators': 10000,             # Number of boosting rounds
    'random_state': 472389
}

model_xgb = XGBRegressor(**params_xgb)

model_xgb.fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)


In [109]:
test

Unnamed: 0,Fuel Consumption City (l/100km),Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km),Engine Size(L),Cylinders,Id,Make,Vehicle Class,Transmission,Fuel Type
0,0.920593,-0.865100,0.283683,0.748470,0.325779,137142,3,2.0,15.0,3.0
1,-0.910920,-0.939220,1.495285,0.846481,0.608439,137143,7,15.0,13.0,3.0
2,1.964239,0.589944,1.498660,-0.329646,-0.522202,137144,16,11.0,19.0,3.0
3,-0.216649,1.428867,-1.017581,1.728575,1.456420,137145,14,2.0,15.0,4.0
4,-0.258921,1.915538,-1.021649,1.532554,2.021740,137146,2,12.0,15.0,4.0
...,...,...,...,...,...,...,...,...,...,...
58771,1.241548,-0.233801,-0.311042,0.454439,-0.239542,195913,11,11.0,3.0,3.0
58772,-0.188727,0.037745,-0.137411,-1.211740,-0.804862,195914,7,9.2,13.0,3.0
58773,0.787471,0.696299,0.775921,0.650460,0.891099,195915,4,12.0,3.0,3.0
58774,0.073664,-0.262323,-0.091855,2.120617,2.021740,195916,7,6.2,13.0,3.0


In [110]:
submission_3 = test.loc[:, ["Id"]]

submission_3['CO2 Emissions(g/km)'] = y_pred

In [111]:
submission_3

Unnamed: 0,Id,CO2 Emissions(g/km)
0,137142,242.520065
1,137143,347.479736
2,137144,211.449829
3,137145,285.716339
4,137146,314.794312
...,...,...
58771,195913,265.171692
58772,195914,200.791443
58773,195915,298.636536
58774,195916,300.951630


In [112]:
import os

if not os.path.exists('../data/submission'):
    os.makedirs('../data/submission')

submission_3.to_csv('../data/submission/submission_4.1.csv', index=False)