In [26]:
import pickle

with open('../data/processed/train-processed-v4.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('../data/processed/test-processed-v4.pkl', 'rb') as f:
    test = pickle.load(f)
    
train.columns.to_numpy()

array(['Fuel Consumption City (l/100km)',
       'Fuel Consumption Hwy (l/100km)',
       'Fuel Consumption Comb (l/100km)', 'Engine Size(L)', 'Cylinders',
       'Id', 'Make', 'Vehicle Class', 'Transmission', 'Fuel Type',
       'CO2 Emissions(g/km)'], dtype=object)

In [27]:
total_data = len(train) + len(test)
train_test_ratio = (len(test) / total_data) * 100

print(f'Total number of data points: {total_data}')
print(f'Number of data points in training set: {len(train)}')
print(f'Number of data points in test set: {len(test)}')
print(f'Ratio of train to test data: {train_test_ratio:.2f}%')

Total number of data points: 78482
Number of data points in training set: 54937
Number of data points in test set: 23545
Ratio of train to test data: 30.00%


In [28]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (54937, 11)
Test shape: (23545, 10)


In [29]:
# Bagi data menjadi fitur dan target
X_train = train.drop('CO2 Emissions(g/km)', axis=1)
y_train = train['CO2 Emissions(g/km)']

X_test = test.copy()

In [30]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)

params_lgb = {
    'objective': 'regression',  # 'regression' for regression tasks
    'metric': 'rmse',           # Root Mean Squared Error (RMSE) for regression
    'boosting_type': 'gbdt',    # Gradient Boosting Decision Tree
    'num_leaves': 512,          # Maximum tree leaves for base learners
    'learning_rate': 0.08,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
}

num_round = 100
model_lgb = lgb.train(params_lgb, train_data, num_round)
y_pred = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1274
[LightGBM] [Info] Number of data points in the train set: 54937, number of used features: 10
[LightGBM] [Info] Start training from score 244.656152


In [31]:
test

Unnamed: 0,Fuel Consumption City (l/100km),Engine Size(L),Cylinders,Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km),Id,Make,Vehicle Class,Transmission,Fuel Type
0,-1.151715,1.967762,1.870520,-1.213536,1.431103,54938,7,6.0,11.0,3.0
1,-1.145496,-0.739679,-0.818868,0.089003,0.214311,54939,2,0.0,5.0,4.0
2,0.444772,-1.371415,-0.818868,-1.201381,-1.201659,54940,10,11.0,6.0,3.0
3,0.000433,-0.739679,-0.818868,0.575176,0.000648,54941,14,11.0,13.0,4.0
4,0.224357,-0.739679,-0.818868,0.210546,-1.199848,54942,1,0.0,23.0,4.0
...,...,...,...,...,...,...,...,...,...,...
23540,0.000433,-0.739679,-0.818868,0.291575,-1.201659,78478,16,11.0,22.0,3.0
23541,0.207962,-1.281167,-0.818868,-0.113569,1.259086,78479,4,0.0,23.0,3.0
23542,-1.149228,-0.775778,-0.818868,-1.203407,0.618097,78480,11,11.0,13.0,3.0
23543,0.492465,-0.378687,-0.818868,1.182892,0.665176,78481,10,11.0,6.0,3.0


In [32]:
y_pred.shape

(23545,)

In [33]:
submission = test.loc[:, ["Id"]]

submission['CO2 Emissions(g/km)'] = y_pred

In [34]:
submission

Unnamed: 0,Id,CO2 Emissions(g/km)
0,54938,319.868795
1,54939,238.303194
2,54940,179.576460
3,54941,259.668909
4,54942,269.952301
...,...,...
23540,78478,259.371580
23541,78479,338.671280
23542,78480,335.320574
23543,78481,325.906280


In [35]:
import os

if not os.path.exists('../data/submission'):
    os.makedirs('../data/submission')

submission.to_csv('../data/submission/submission_7.csv', index=False)