In [2]:
import pickle

with open('../data/processed/train-processed-v2.pkl', 'rb') as f:
    train = pickle.load(f)
    
with open('../data/processed/test-processed-v2.pkl', 'rb') as f:
    test = pickle.load(f)
    
train.columns.to_numpy()

array(['Fuel Consumption City (l/100km)',
       'Fuel Consumption Hwy (l/100km)',
       'Fuel Consumption Comb (l/100km)', 'Engine Size(L)', 'Cylinders',
       'Id', 'Make', 'Vehicle Class', 'Transmission', 'Fuel Type',
       'CO2 Emissions(g/km)'], dtype=object)

In [3]:
total_data = len(train) + len(test)
train_test_ratio = (len(test) / total_data) * 100

print(f'Total number of data points: {total_data}')
print(f'Number of data points in training set: {len(train)}')
print(f'Number of data points in test set: {len(test)}')
print(f'Ratio of train to test data: {train_test_ratio:.2f}%')

Total number of data points: 195917
Number of data points in training set: 137141
Number of data points in test set: 58776
Ratio of train to test data: 30.00%


In [4]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (137141, 11)
Test shape: (58776, 10)


In [5]:
# Bagi data menjadi fitur dan target
X_train = train.drop('CO2 Emissions(g/km)', axis=1)
y_train = train['CO2 Emissions(g/km)']

X_test = test.copy()

In [10]:
import lightgbm as lgb

train_data = lgb.Dataset(X_train, label=y_train)

params_lgb = {
    'objective': 'regression',  # 'regression' for regression tasks
    'metric': 'rmse',           # Root Mean Squared Error (RMSE) for regression
    'boosting_type': 'gbdt',    # Gradient Boosting Decision Tree
    'num_leaves': 512,          # Maximum tree leaves for base learners
    'learning_rate': 0.08,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
}

num_round = 100
model_lgb = lgb.train(params_lgb, train_data, num_round)
y_pred = model_lgb.predict(X_test, num_iteration=model_lgb.best_iteration)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054993 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1451
[LightGBM] [Info] Number of data points in the train set: 137141, number of used features: 10
[LightGBM] [Info] Start training from score 242.370040


In [11]:
test

Unnamed: 0,Fuel Consumption City (l/100km),Fuel Consumption Hwy (l/100km),Fuel Consumption Comb (l/100km),Engine Size(L),Cylinders,Id,Make,Vehicle Class,Transmission,Fuel Type
0,0.920593,-0.865100,0.283683,0.748470,0.325779,137142,3,2.0,15.0,3.0
1,-0.910920,-0.939220,1.495285,0.846481,0.608439,137143,7,15.0,13.0,3.0
2,1.964239,0.589944,1.498660,-0.329646,-0.522202,137144,16,11.0,19.0,3.0
3,-0.216649,1.428867,-1.017581,1.728575,1.456420,137145,14,2.0,15.0,4.0
4,-0.258921,1.915538,-1.021649,1.532554,2.021740,137146,2,12.0,15.0,4.0
...,...,...,...,...,...,...,...,...,...,...
58771,1.241548,-0.233801,-0.311042,0.454439,-0.239542,195913,11,11.0,3.0,3.0
58772,-0.188727,0.037745,-0.137411,-1.211740,-0.804862,195914,7,9.2,13.0,3.0
58773,0.787471,0.696299,0.775921,0.650460,0.891099,195915,4,12.0,3.0,3.0
58774,0.073664,-0.262323,-0.091855,2.120617,2.021740,195916,7,6.2,13.0,3.0


In [12]:
y_pred.shape

(58776,)

In [13]:
submission = test.loc[:, ["Id"]]

submission['CO2 Emissions(g/km)'] = y_pred

In [14]:
submission

Unnamed: 0,Id,CO2 Emissions(g/km)
0,137142,247.362504
1,137143,365.576392
2,137144,231.521060
3,137145,305.291790
4,137146,317.025222
...,...,...
58771,195913,270.021859
58772,195914,182.816651
58773,195915,302.998465
58774,195916,313.651689


In [15]:
import os

if not os.path.exists('../data/submission'):
    os.makedirs('../data/submission')

submission.to_csv('../data/submission/submission_4.4.csv', index=False)