In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.ar_model import AutoReg

In [23]:
train_df = pd.read_csv('data/train ori.csv')
test_df = pd.read_csv('data/test ori.csv')

In [24]:
train_df['waktu_setempat'] = pd.to_datetime(train_df['waktu_setempat']).dt.tz_localize(None)
test_df['waktu_setempat'] = pd.to_datetime(test_df['waktu_setempat']).dt.tz_localize(None)

In [25]:
def interpolate_hourly(group):   
    group = group.set_index('waktu_setempat')
    group = group.resample('1H').first()
    
    # Linearly interpolate the 'rerata_kecepatan' column
    group['rerata_kecepatan'] = group['rerata_kecepatan'].interpolate(method='linear')
    
    # Reset index and forward fill non-'rerata_kecepatan' columns
    group = group.reset_index()
    group[['id_jalan', 'id_titik_mulai', 'id_titik_akhir']] = group[['id_jalan', 'id_titik_mulai', 'id_titik_akhir']].ffill()
    
    return group

# Apply the interpolation function to each group
train_df = train_df.groupby(['id_jalan', 'id_titik_mulai', 'id_titik_akhir']).apply(interpolate_hourly)

# Reset the index
train_df.reset_index(drop=True, inplace=True)

In [16]:
# from statsmodels.tsa.stattools import adfuller

# dftest = adfuller(train_df['rerata_kecepatan'], autolag = 'AIC')

# print("1. ADF : ", dftest[0])
# print("2. P-Value : ", dftest[1])
# print("3. Num Of Lags : ", dftest[2])
# print("4. Num Of Observations Used For ADF Regression and Critical Values Calculation :", dftest[3])
# print("5. Critical Values :")
# for key, val in dftest[4].items():
#     print("\t",key, ": ", val)

In [26]:
train_df['id_jalan'] = train_df['id_jalan'].astype(int)
train_df['id_titik_mulai'] = train_df['id_titik_mulai'].astype(int)
train_df['id_titik_akhir'] = train_df['id_titik_akhir'].astype(int)

In [27]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492157 entries, 0 to 492156
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   waktu_setempat    492157 non-null  datetime64[ns]
 1   id_jalan          492157 non-null  int32         
 2   id_titik_mulai    492157 non-null  int32         
 3   id_titik_akhir    492157 non-null  int32         
 4   rerata_kecepatan  492157 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(3)
memory usage: 13.1 MB


In [28]:
# Remove outliers from 'rerata_kecepatan' column
Q1 = train_df['rerata_kecepatan'].quantile(0.25)
Q3 = train_df['rerata_kecepatan'].quantile(0.75)

IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

train_df = train_df[(train_df['rerata_kecepatan'] >= lower_bound) & (
    train_df['rerata_kecepatan'] <= upper_bound)]

In [30]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 477337 entries, 0 to 492156
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   waktu_setempat    477337 non-null  datetime64[ns]
 1   id_jalan          477337 non-null  int32         
 2   id_titik_mulai    477337 non-null  int32         
 3   id_titik_akhir    477337 non-null  int32         
 4   rerata_kecepatan  477337 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int32(3)
memory usage: 16.4 MB


In [31]:
train_df['unique_id'] = train_df['id_jalan'].astype(str) + '_' + train_df['id_titik_mulai'].astype(str) + '_' + train_df['id_titik_akhir'].astype(str)
test_df['unique_id'] = test_df['id_jalan'].astype(str) + '_' + test_df['id_titik_mulai'].astype(str) + '_' + test_df['id_titik_akhir'].astype(str)

In [10]:
# import pandas as pd
# import seaborn as sns
# # import matplotlib.pyplot as plt

# # Assuming you have already loaded the DataFrame as 'train_df'
# sns.set(style="whitegrid")
# plt.figure(figsize=(12, 8))
# sns.heatmap(train_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
# plt.title("Correlation Heatmap for train_df")
# plt.show()

In [32]:
train_df['waktu_setempat'] = pd.to_datetime(train_df['waktu_setempat']).dt.tz_localize(None)
test_df['waktu_setempat'] = pd.to_datetime(test_df['waktu_setempat']).dt.tz_localize(None)

In [34]:
train_df.drop(columns=['id_jalan','id_titik_mulai', 'id_titik_akhir'], inplace=True)
test_df.drop(columns=['id_jalan','id_titik_mulai', 'id_titik_akhir'], inplace=True)

In [36]:
train_df['unique_id'] = train_df['unique_id'].astype('category')
test_df['unique_id'] = test_df['unique_id'].astype('category')

In [37]:
id_series = test_df['id']
id_df = pd.DataFrame({'id': id_series})

In [46]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Assuming you have a DataFrame named 'train_df' with columns 'waktu_setempat', 'rerata_kecepatan', and 'unique_id'
# Assuming you have a DataFrame named 'test_df' with columns 'waktu_setempat' and 'unique_id'

# Convert the 'waktu_setempat' column to datetime if it's not already
train_df['waktu_setempat'] = pd.to_datetime(train_df['waktu_setempat'])
test_df['waktu_setempat'] = pd.to_datetime(test_df['waktu_setempat'])

# Sort the DataFrames by the datetime column
train_df.sort_values(by='waktu_setempat', inplace=True)
test_df.sort_values(by='waktu_setempat', inplace=True)

# Initialize an empty dictionary to store fitted AR models
fitted_models = {}

# Use fitted models to make predictions for each unique_id in the train DataFrame
for unique_id, group_data in train_df.groupby('unique_id'):
    # Fit an AR model using only the 'rerata_kecepatan' column
    ar_model = sm.OLS(group_data['rerata_kecepatan'], sm.add_constant(group_data.index)).fit()

    # Store the fitted model for the unique_id
    fitted_models[unique_id] = ar_model

# Initialize an empty list to store test forecasted DataFrames
test_forecasted_dfs = []

# Use fitted models to make predictions for each unique_id in the test DataFrame
for unique_id, group_data in test_df.groupby('unique_id'):
    if unique_id in fitted_models:
        model = fitted_models[unique_id]

        # Predict using the AR model on the test data
        forecast_index = np.arange(len(train_df), len(train_df) + len(group_data))
        forecast_test = model.predict(sm.add_constant(forecast_index))

        # Append test forecasted results to the list
        test_forecasted_dfs.append(
            pd.DataFrame({'unique_id': [unique_id] * len(forecast_test),
                          'waktu_setempat': group_data['waktu_setempat'],
                          'forecast': forecast_test})
        )
    else:
        print(f"Skipping unique_id {unique_id} (not in fitted_models)")

# Concatenate all test forecasted DataFrames into one
test_forecasted_results = pd.concat(test_forecasted_dfs, ignore_index=True)

# Print the test forecasted results
print(test_forecasted_results)


Skipping unique_id 1210_195855_5861520677 (not in fitted_models)
Skipping unique_id 1210_245009257_5861520677 (not in fitted_models)
Skipping unique_id 1210_5861520677_195855 (not in fitted_models)
Skipping unique_id 1210_5861520677_245009257 (not in fitted_models)
Skipping unique_id 142479648_1111592522_3775227674 (not in fitted_models)
Skipping unique_id 142479648_1111592522_3775231113 (not in fitted_models)
Skipping unique_id 142479648_26346748_4721090644 (not in fitted_models)
Skipping unique_id 142479648_26346760_4721090644 (not in fitted_models)
Skipping unique_id 142479648_26346770_4062334252 (not in fitted_models)
Skipping unique_id 142479648_26346790_3775227674 (not in fitted_models)
Skipping unique_id 142479648_3775227674_1111592522 (not in fitted_models)
Skipping unique_id 142479648_3775227674_26346790 (not in fitted_models)
Skipping unique_id 142479648_3775231113_1111592522 (not in fitted_models)
Skipping unique_id 142479648_3775231113_4062334248 (not in fitted_models)
Skip

In [47]:
test_forecasted_results

Unnamed: 0,unique_id,waktu_setempat,forecast
0,1210_1030634572_1030634588,2020-02-23 00:00:00,1698.891717
1,1210_1030634572_1030634588,2020-02-23 01:00:00,1698.895285
2,1210_1030634572_1030634588,2020-02-23 02:00:00,1698.898853
3,1210_1030634572_1030634588,2020-02-23 03:00:00,1698.902421
4,1210_1030634572_1030634588,2020-02-23 06:00:00,1698.905990
...,...,...,...
88447,97453767_33203791_20961350,2020-02-29 19:00:00,366.817052
88448,97453767_33203791_20961350,2020-02-29 20:00:00,366.821694
88449,97453767_33203791_20961350,2020-02-29 21:00:00,366.826335
88450,97453767_33203791_20961350,2020-02-29 22:00:00,366.830977


In [23]:
results = pd.concat([id_df, test_forecasted_results['yhat']], axis=1)
results.rename(columns={'yhat': 'rerata_kecepatan'}, inplace=True)

print(results)

            id  rerata_kecepatan
0            0         53.050744
1            1         52.474683
2            2         52.876252
3            3         54.175608
4            4         56.050348
...        ...               ...
127484  127484               NaN
127485  127485               NaN
127486  127486               NaN
127487  127487               NaN
127488  127488               NaN

[127489 rows x 2 columns]


In [24]:
# Store the concatenated DataFrame to a CSV file
results.to_csv('Data/results.csv', index=False)

print("Data saved")

PermissionError: [Errno 13] Permission denied: 'Data/results.csv'

In [None]:
# # Create a DataFrame with the future dates you want to forecast (for test)
# future_test = model.make_future_dataframe(periods=len(test_df), freq='H', include_history=False)

# # Make predictions for the test dataset
# forecast_test = model.predict(future_test)

# # Join forecasted values with the actual values in the test_df
# forecasted_test_df = test_df.join(forecast_test.set_index('ds')[['yhat', 'yhat_lower', 'yhat_upper']])
# forecasted_test_df


In [None]:
# id_series = test_df['id']
# id_df = pd.DataFrame({'id': id_series})

# test_df.drop(columns=['id'], inplace=True)
# # test_df.drop(columns=['id', 'lanes', 'lanes_forward', 'lanes_backward'], inplace=True)
# # test_df.drop(columns=['id', 'busway', 'lanes_forward', 'lanes_backward'], inplace=True)