In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [30]:
# Read the training and test CSV files; the first path becomes train_df,
# the second becomes test_df.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [31]:
# Print the column names, non-null counts and dtypes of the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398648 entries, 0 to 398647
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   waktu_setempat    398648 non-null  object 
 1   id_jalan          398648 non-null  int64  
 2   id_titik_mulai    398648 non-null  int64  
 3   id_titik_akhir    398648 non-null  int64  
 4   rerata_kecepatan  398648 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 15.2+ MB


In [32]:
# Print the column names, non-null counts and dtypes of the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 127489 entries, 0 to 127488
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              127489 non-null  int64 
 1   waktu_setempat  127489 non-null  object
 2   id_jalan        127489 non-null  int64 
 3   id_titik_mulai  127489 non-null  int64 
 4   id_titik_akhir  127489 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 4.9+ MB


In [33]:
# Number of distinct id_jalan values in the training data.
train_df['id_jalan'].nunique()

20

In [34]:
# Number of distinct id_jalan values in the test data (compare with the training count).
test_df['id_jalan'].nunique()

20

In [35]:
# Parse the timestamp column into pandas datetimes so the `.dt` accessor
# can be used on it when deriving calendar features.
train_df['waktu_setempat'] = pd.to_datetime(train_df['waktu_setempat'])
test_df['waktu_setempat'] = pd.to_datetime(test_df['waktu_setempat'])

In [36]:
# Derive calendar features from the parsed timestamp of each training row.
train_df['hour'] = train_df['waktu_setempat'].dt.hour
train_df['day_of_week'] = train_df['waktu_setempat'].dt.dayofweek
train_df['month'] = train_df['waktu_setempat'].dt.month
# ISO week number, cast to a plain integer dtype in the same statement.
train_df['week'] = train_df['waktu_setempat'].dt.isocalendar().week.astype(int)


In [37]:
# Derive the same calendar features for the test rows as for the training rows.
test_df['hour'] = test_df['waktu_setempat'].dt.hour
test_df['day_of_week'] = test_df['waktu_setempat'].dt.dayofweek
test_df['month'] = test_df['waktu_setempat'].dt.month
# ISO week number, cast to a plain integer dtype in the same statement.
test_df['week'] = test_df['waktu_setempat'].dt.isocalendar().week.astype(int)

In [38]:
# Re-inspect the training frame to confirm the new hour/day_of_week/month/week columns.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398648 entries, 0 to 398647
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype              
---  ------            --------------   -----              
 0   waktu_setempat    398648 non-null  datetime64[ns, UTC]
 1   id_jalan          398648 non-null  int64              
 2   id_titik_mulai    398648 non-null  int64              
 3   id_titik_akhir    398648 non-null  int64              
 4   rerata_kecepatan  398648 non-null  float64            
 5   hour              398648 non-null  int32              
 6   day_of_week       398648 non-null  int32              
 7   month             398648 non-null  int32              
 8   week              398648 non-null  int32              
dtypes: datetime64[ns, UTC](1), float64(1), int32(4), int64(3)
memory usage: 21.3 MB


In [39]:
# Drop the raw timestamp column from both frames now that the calendar
# features have been derived from it.
# NOTE(review): the drop is in place, so re-running this cell without first
# reloading the data raises KeyError because the column is already gone.
train_df.drop(columns=['waktu_setempat'], inplace=True)
test_df.drop(columns=['waktu_setempat'], inplace=True)

In [40]:
# train_df['waktu_setempat'] = pd.to_datetime(train_df['waktu_setempat'])
# train_df['date'] = train_df['waktu_setempat'].dt.date
# train_df['time'] = train_df['waktu_setempat'].dt.time
# train_df.drop(columns=['waktu_setempat'], inplace=True)

# test_df['waktu_setempat'] = pd.to_datetime(test_df['waktu_setempat'])
# test_df['date'] = test_df['waktu_setempat'].dt.date
# test_df['time'] = test_df['waktu_setempat'].dt.time
# test_df.drop(columns=['waktu_setempat'], inplace=True)

In [12]:
## Train - Test Split
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Function to calculate sMAPE
def smape(actual, forecast):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |forecast - actual| / (|actual| + |forecast|))``.

    Parameters
    ----------
    actual : array-like
        Observed values.
    forecast : array-like
        Predicted values, paired with ``actual`` by position.

    Returns
    -------
    float
        The sMAPE score; 0 means a perfect forecast.

    Notes
    -----
    A pair in which both values are zero has a zero denominator. It is
    counted as zero error instead of turning the whole score into NaN.
    """
    # Convert to plain arrays so pairing is positional, never by pandas index.
    actual = np.asarray(actual, dtype=float)
    forecast = np.asarray(forecast, dtype=float)

    denominator = np.abs(actual) + np.abs(forecast)
    # The denominator is zero only when both values are zero, in which case the
    # numerator is zero as well; dividing by 1 there gives a 0 contribution.
    safe_denominator = np.where(denominator == 0, 1.0, denominator)

    return 100 * np.mean(2 * np.abs(forecast - actual) / safe_denominator)

# Features are every remaining column except the target; the target is rerata_kecepatan.
X = train_df.drop(columns=['rerata_kecepatan'])
y = train_df['rerata_kecepatan']

# Hold out 30% of the rows for evaluation, with a fixed seed so the split is repeatable.
# NOTE(review): this is a random split; if the rows are time-ordered observations,
# a chronological hold-out may give a more realistic error estimate — confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [13]:
# Linear Regression baseline: fit on the training split and score the
# held-out split with sMAPE.
linear_reg_model = LinearRegression().fit(X_train, y_train)
y_pred_linear_reg = linear_reg_model.predict(X_test)
smape_linear_reg = smape(y_test, y_pred_linear_reg)
print("Linear Regression sMAPE:", smape_linear_reg)

Linear Regression sMAPE: 17.38324254375391


In [108]:
# Step 5: ElasticNet Model Training and Evaluation
# ElasticNet is not brought into scope by any of the visible import cells, so
# import it here; otherwise this cell raises NameError on a fresh kernel.
from sklearn.linear_model import ElasticNet

# Fit with default hyperparameters on the training split.
elasticnet_model = ElasticNet()
elasticnet_model.fit(X_train, y_train)

# Score the held-out split with sMAPE.
y_pred_elasticnet = elasticnet_model.predict(X_test)
smape_elasticnet = smape(y_test, y_pred_elasticnet)
print("ElasticNet sMAPE:", smape_elasticnet)

ElasticNet sMAPE: 17.38347003096355


In [15]:
from xgboost import XGBRegressor

# XGBRegressor baseline with default hyperparameters: fit on the training
# split and score the held-out split with sMAPE.
xgb_model = XGBRegressor().fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
smape_xgb = smape(y_test, y_pred_xgb)
print("XGBoost sMAPE:", smape_xgb)

XGBoost sMAPE: 8.92270550653233


In [41]:
# Keep the test-set ids aside so they can be joined back onto the predictions
# when the submission frame is built.
id_series = test_df['id']
id_df = pd.DataFrame({'id': id_series})

In [42]:
# Remove the id column from test_df; the training features (X) have no such column.
# NOTE(review): the drop is in place, so re-running this cell without reloading
# the data raises KeyError because the column is already gone.
test_df.drop(columns=['id'], inplace=True)

In [128]:
import optuna
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# Define the sMAPE function
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``200 * mean(|y_pred - y_true| / (|y_pred| + |y_true|))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, paired with ``y_true`` by position.

    Returns
    -------
    float
        The sMAPE score; 0 means a perfect forecast.

    Notes
    -----
    A pair in which both values are zero has a zero denominator. It is
    counted as zero error instead of turning the whole score into NaN.
    """
    # Convert to plain arrays so pairing is positional, never by pandas index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = np.abs(y_pred) + np.abs(y_true)
    # The denominator is zero only when both values are zero, in which case the
    # numerator is zero as well; dividing by 1 there gives a 0 contribution.
    safe_denominator = np.where(denominator == 0, 1.0, denominator)

    return 200 * np.mean(np.abs(y_pred - y_true) / safe_denominator)

# Define the objective function for Optuna
def objective(trial):
    """Train an XGBoost regressor with trial-suggested hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        sMAPE of the model's predictions for ``X_test`` against ``y_test``
        (lower is better).
    """
    # Define the hyperparameters to be optimized
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        # suggest_loguniform is deprecated in Optuna; suggest_float with
        # log=True samples the same range on a log scale.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
    }

    # Create the XGBoost model with the suggested hyperparameters
    model = xgb.XGBRegressor(**params)

    # Train the model
    model.fit(X_train, y_train)

    # NOTE(review): trials are scored on X_test, the same split later used to
    # report the tuned model's sMAPE, so that reported score is likely
    # optimistic. Consider a separate validation split for tuning.
    y_pred = model.predict(X_test)

    # Calculate the sMAPE
    return smape(y_test, y_pred)


In [131]:
# Create an Optuna study object. TPE is Optuna's default sampler; seeding it
# makes the sequence of suggested hyperparameters repeatable across runs.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Start the optimization process
study.optimize(objective, n_trials=25)

# Get the best hyperparameters found by Optuna
best_params = study.best_params

# Create the XGBoost model with the best hyperparameters
xgb_model = xgb.XGBRegressor(**best_params)

# Refit on the training split (X_train / y_train), not on the full dataset
xgb_model.fit(X_train, y_train)

# Make predictions on the held-out split
y_pred_xgb = xgb_model.predict(X_test)

# Calculate sMAPE
smape_xgb = smape(y_test, y_pred_xgb)

# Print the results
print("Best Hyperparameters:", best_params)
print("XGBoost sMAPE:", smape_xgb)

[I 2023-08-05 16:45:43,643] A new study created in memory with name: no-name-20e3642e-644f-43cf-b6f9-46af400385ab


[I 2023-08-05 16:46:29,521] Trial 0 finished with value: 65.84411927791868 and parameters: {'n_estimators': 570, 'learning_rate': 0.001178716157243451, 'max_depth': 6, 'subsample': 0.7909892317907676, 'colsample_bytree': 0.5977480701758112}. Best is trial 0 with value: 65.84411927791868.
[I 2023-08-05 16:47:38,376] Trial 1 finished with value: 7.853727149855469 and parameters: {'n_estimators': 393, 'learning_rate': 0.05612844421193374, 'max_depth': 10, 'subsample': 0.36939762212358285, 'colsample_bytree': 0.9198966780848803}. Best is trial 1 with value: 7.853727149855469.
[I 2023-08-05 16:47:55,828] Trial 2 finished with value: 12.619313577672978 and parameters: {'n_estimators': 427, 'learning_rate': 0.02771399532776737, 'max_depth': 4, 'subsample': 0.9610092174915718, 'colsample_bytree': 0.3319903885636597}. Best is trial 1 with value: 7.853727149855469.
[I 2023-08-05 16:48:27,681] Trial 3 finished with value: 8.960219171621759 and parameters: {'n_estimators': 373, 'learning_rate': 0.

Best Hyperparameters: {'n_estimators': 999, 'learning_rate': 0.06803356336734688, 'max_depth': 10, 'subsample': 0.5911186095161197, 'colsample_bytree': 0.724513135069416}
XGBoost sMAPE: 7.676739726468258


In [135]:
# KNeighborsRegressor
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split

# Build the KNeighborsRegressor (5 neighbours, uniform weights) and fit it
# on the training split in one step.
knn_model = KNeighborsRegressor(n_neighbors=5, weights='uniform').fit(X_train, y_train)

# Predict the held-out split and score it with sMAPE.
y_pred_knn = knn_model.predict(X_test)
smape_knn = smape(y_test, y_pred_knn)

# Report the score.
print("KNeighborsRegressor sMAPE:", smape_knn)

KNeighborsRegressor sMAPE: 8.038132606464304


In [16]:
# RandomForestRegressor
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Build the RandomForestRegressor (100 trees, fixed seed) and fit it on the
# training split in one step.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split and score it with sMAPE.
y_pred_rf = rf_model.predict(X_test)
smape_rf = smape(y_test, y_pred_rf)

# Report the score.
print("RandomForestRegressor sMAPE:", smape_rf)

RandomForestRegressor sMAPE: 8.301866824296635


In [17]:
# ExtraTreesRegressor
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

# Build the ExtraTreesRegressor (100 trees, fixed seed) and fit it on the
# training split in one step.
etr_model = ExtraTreesRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split and score it with sMAPE.
y_pred_etr = etr_model.predict(X_test)
smape_etr = smape(y_test, y_pred_etr)

# Report the score.
print("ExtraTreesRegressor sMAPE:", smape_etr)


ExtraTreesRegressor sMAPE: 9.264608275482022


In [44]:
# Predict the target for the test rows with the fitted KNN model.
y_results = knn_model.predict(test_df)
y_results_df = pd.DataFrame({'rerata_kecepatan': y_results})

# Concatenate id_df and y_results_df horizontally
result_df = pd.concat([id_df, y_results_df], axis=1)
print(result_df)

# Store the concatenated DataFrame to a CSV file.
# Use the same lowercase 'data/' directory the inputs were read from; the
# previous 'Data/' spelling names a different directory on case-sensitive
# filesystems and did not match the path reported below.
submission_path = 'data/submission.csv'
result_df.to_csv(submission_path, index=False)

print(f"Data saved to '{submission_path}'")

            id  rerata_kecepatan
0            0         43.557128
1            1         36.630407
2            2         36.895161
3            3         43.895500
4            4         31.620071
...        ...               ...
127484  127484         31.794929
127485  127485         37.400357
127486  127486         39.474581
127487  127487         34.847019
127488  127488         44.942365

[127489 rows x 2 columns]
Data saved to 'data/submission.csv'
