In [11]:
import pickle

with open('../data/processed/train-processed-v3.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('../data/processed/test-processed-v3.pkl', 'rb') as f:
    test = pickle.load(f)
    
train.columns.to_numpy()

array(['Fuel Consumption City (l/100km)',
       'Fuel Consumption Hwy (l/100km)',
       'Fuel Consumption Comb (l/100km)', 'Engine Size(L)', 'Cylinders',
       'Id', 'Make', 'Vehicle Class', 'Transmission', 'Fuel Type',
       'CO2 Emissions(g/km)'], dtype=object)

In [12]:
total_data = len(train) + len(test)
train_test_ratio = (len(test) / total_data) * 100

print(f'Total number of data points: {total_data}')
print(f'Number of data points in training set: {len(train)}')
print(f'Number of data points in test set: {len(test)}')
print(f'Ratio of train to test data: {train_test_ratio:.2f}%')

Total number of data points: 78482
Number of data points in training set: 54937
Number of data points in test set: 23545
Ratio of train to test data: 30.00%


In [13]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (54937, 11)
Test shape: (23545, 10)


In [14]:
# Bagi data menjadi fitur dan target
X_train = train.drop('CO2 Emissions(g/km)', axis=1)
y_train = train['CO2 Emissions(g/km)']

X_test = test.copy()

In [15]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)

params_lgb = {
    'objective': 'regression',  # 'regression' for regression tasks
    'metric': 'rmse',           # Root Mean Squared Error (RMSE) for regression
    'boosting_type': 'gbdt',    # Gradient Boosting Decision Tree
    'num_leaves': 512,          # Maximum tree leaves for base learners
    'learning_rate': 0.08,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
}

num_round = 100
model_lgb = lgb.train(params_lgb, train_data, num_round)
y_pred = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1531
[LightGBM] [Info] Number of data points in the train set: 54937, number of used features: 10
[LightGBM] [Info] Start training from score 244.656152


In [16]:
test

Unnamed: 0,Fuel Consumption City (l/100km),Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km),Engine Size(L),Cylinders,Id,Make,Vehicle Class,Transmission,Fuel Type
0,-1.151715,-1.214411,1.430393,1.967723,1.873681,54938,7,6.0,11.0,3.0
1,-1.145496,0.088944,0.214390,-0.740459,-0.819907,54939,2,0.0,5.0,4.0
2,0.444772,-1.201767,-1.201712,-1.372368,-0.819907,54940,10,11.0,6.0,3.0
3,0.000433,0.575070,0.000000,-0.740459,-0.819907,54941,14,11.0,13.0,4.0
4,0.224357,0.210388,-1.199045,-0.740459,-0.819907,54942,1,0.0,23.0,4.0
...,...,...,...,...,...,...,...,...,...,...
23540,0.000433,0.291940,-1.201403,-0.740459,-0.819907,78478,16,11.0,22.0,3.0
23541,0.207962,-0.113475,1.258849,-1.282096,-0.819907,78479,4,0.0,23.0,3.0
23542,-1.149228,-1.203622,0.618579,-0.586996,-0.819907,78480,11,11.0,13.0,3.0
23543,0.492465,1.183846,0.664583,-0.379368,-0.819907,78481,10,11.0,6.0,3.0


In [17]:
y_pred.shape

(23545,)

In [18]:
submission = test.loc[:, ["Id"]]

submission['CO2 Emissions(g/km)'] = y_pred

In [19]:
submission

Unnamed: 0,Id,CO2 Emissions(g/km)
0,54938,326.716482
1,54939,199.661104
2,54940,213.611424
3,54941,234.430105
4,54942,232.499414
...,...,...
23540,78478,210.237664
23541,78479,172.544604
23542,78480,231.630374
23543,78481,225.029511


In [20]:
import os

if not os.path.exists('../data/submission'):
    os.makedirs('../data/submission')

submission.to_csv('../data/submission/submission_5.csv', index=False)