In [21]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.ensemble import HistGradientBoostingRegressor

In [191]:
def load_data(folder_path):
    """
    Load every CSV file in a folder into one combined DataFrame.

    Each file contributes its rows plus a 'Category' column holding the file
    name without its extension. The combined 'Date' column is parsed with
    pd.to_datetime.

    Files are read in sorted name order. os.listdir returns entries in
    arbitrary order, which would otherwise let the row order (and any seeded
    random split taken from it) differ between machines.

    Parameters:
    folder_path (str): Directory containing the CSV files.

    Returns:
    pd.DataFrame: The concatenated data, or an empty DataFrame (after printing
    a notice) when the folder holds no '.csv' files.
    """
    csv_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
    if not csv_names:
        print("No CSV files found in the specified folder.")
        return pd.DataFrame()

    frames = []
    for name in csv_names:
        frame = pd.read_csv(os.path.join(folder_path, name))
        frame['Category'] = os.path.splitext(name)[0]
        frames.append(frame)

    merged_data = pd.concat(frames, ignore_index=True)
    merged_data['Date'] = pd.to_datetime(merged_data['Date'])
    return merged_data

In [12]:
def load_data2(base_folder_path):
    """
    Load per-province CSV files from category sub-folders into a wide table.

    Every sub-directory of base_folder_path is treated as a category and every
    '.csv' inside it as one province (file name without extension). For each
    file, the first column other than 'Date', 'Category' and 'Province' (in the
    sorted order returned by Index.difference) is copied into 'Price'. Files
    with no such column are reported and skipped.

    Parameters:
    base_folder_path (str): Directory whose sub-directories hold the CSV files.

    Returns:
    pd.DataFrame: One row per (Date, Category) with one column per province,
    category names replaced by their display labels; an empty DataFrame (after
    printing a notice) when no usable CSV was found.
    """
    category_labels = {
        'bawang merah': 'Bawang Merah',
        'bawang putih': 'Bawang Putih Bonggol',
        'cabai merah': 'Cabai Merah Keriting',
        'cabai rawit': 'Cabai Rawit Merah',
        'daging ayam': 'Daging Ayam Ras',
        'daging sapi': 'Daging Sapi Murni',
        'gula': 'Gula Konsumsi',
        'telur ayam': 'Telur Ayam Ras',
        'tepung terigu': 'Tepung Terigu (Curah)',
    }
    reserved = ['Date', 'Category', 'Province']

    frames = []
    for category in os.listdir(base_folder_path):
        category_path = os.path.join(base_folder_path, category)
        if not os.path.isdir(category_path):
            continue

        for filename in os.listdir(category_path):
            if not filename.endswith('.csv'):
                continue

            frame = pd.read_csv(os.path.join(category_path, filename))
            frame['Category'] = category
            frame['Province'] = os.path.splitext(filename)[0]

            candidates = frame.columns.difference(reserved)
            if candidates.empty:
                print(f"No price column found in {filename}. Skipping this file.")
                continue

            frame['Price'] = frame[candidates[0]]
            frames.append(frame)

    if not frames:
        print("No CSV files found in the specified folder.")
        return pd.DataFrame()

    long_table = pd.concat(frames, ignore_index=True)
    long_table['Date'] = pd.to_datetime(long_table['Date'])

    # One column per province; 'first' keeps the first value for a repeated key.
    wide_table = long_table.pivot_table(
        index=['Date', 'Category'],
        columns='Province',
        values='Price',
        aggfunc='first',
    ).reset_index()
    wide_table['Category'] = wide_table['Category'].replace(category_labels)
    return wide_table


In [13]:
def google_trend(folder_path):
    """
    Load flat Google Trend CSV files (one file per category) into one DataFrame.

    Each '.csv' in folder_path is read, its 'Date' column parsed, and a
    'Category' column set to the file name without the '.csv' suffix. A few
    English province column names are renamed to their Indonesian spelling, and
    category names are replaced by their display labels after concatenation.

    Parameters:
    folder_path (str): Directory containing the CSV files.

    Returns:
    pd.DataFrame or None: The concatenated data, or None (after printing a
    notice) when the folder holds no '.csv' files.
    """
    suffix = '.csv'
    province_aliases = {
        'Riau Islands': 'Kepulauan Riau',
        'Bangka Belitung Islands': 'Kepulauan Bangka Belitung',
        'Sumatra Barat': 'Sumatera Barat',
        'Sumatra Utara': 'Sumatera Utara',
        'Sumatra Selatan': 'Sumatera Selatan',
    }
    category_labels = {
        'bawang merah': 'Bawang Merah',
        'bawang putih': 'Bawang Putih Bonggol',
        'cabai merah': 'Cabai Merah Keriting',
        'cabai rawit': 'Cabai Rawit Merah',
        'daging ayam': 'Daging Ayam Ras',
        'daging sapi': 'Daging Sapi Murni',
        'gula': 'Gula Konsumsi',
        'telur ayam': 'Telur Ayam Ras',
        'tepung terigu': 'Tepung Terigu (Curah)',
    }

    frames = []
    for entry in os.listdir(folder_path):
        if not entry.endswith(suffix):
            continue
        frame = pd.read_csv(os.path.join(folder_path, entry))
        frame['Date'] = pd.to_datetime(frame['Date'])
        frame['Category'] = entry[:-len(suffix)]
        frames.append(frame.rename(columns=province_aliases))

    if not frames:
        print("No CSV files found in the specified folder.")
        return None

    merged_df = pd.concat(frames, ignore_index=True)
    merged_df['Category'] = merged_df['Category'].replace(category_labels)
    return merged_df


In [130]:
def fillna(df):
    """
    Replace missing values with 0 in every column that is less than 50% missing.

    Columns where half or more of the values are missing are left untouched.
    The frame is modified in place and also returned.

    Parameters:
    df (pd.DataFrame): The DataFrame to fill.

    Returns:
    pd.DataFrame: The same DataFrame object, after filling.
    """
    missing_percentage = df.isnull().sum() / len(df) * 100
    fillable = [name for name in df.columns if missing_percentage[name] < 50]
    for name in fillable:
        df[name] = df[name].fillna(0)
    return df

In [15]:
def melt(df, value_name):
    """
    Reshape a wide per-province table into long format.

    Parameters:
    df (pd.DataFrame): Table with 'Date' and 'Category' columns; every other
        column is unpivoted.
    value_name (str): Name for the column that receives the unpivoted values.

    Returns:
    pd.DataFrame: Columns 'Date', 'Category', 'Province' (the former column
    name) and value_name.
    """
    return pd.melt(
        df,
        id_vars=['Date', 'Category'],
        var_name='Province',
        value_name=value_name,
    )

In [16]:
import pandas as pd

def merge(df1, df2):
    """
    Left-join long-format quantities onto long-format prices.

    Rows are matched on 'Date', 'Category' and 'Province'. Four price
    categories have no quantity series of their own and instead share one:
    'Beras Medium' and 'Beras Premium' take the summed 'beras' quantity, and
    'Minyak Goreng Curah' and 'Minyak Goreng Kemasan Sederhana' take the summed
    'minyak goreng' quantity, for the same Date and Province.

    A shared source category that is absent from df2 is skipped, leaving the
    'Quantity' from the plain join in place. The previous implementation
    assigned two fixed column names to the unstacked table and raised a
    length-mismatch ValueError unless both source categories were present.

    Parameters:
    df1 (pd.DataFrame): Prices with 'Date', 'Category', 'Province' columns.
    df2 (pd.DataFrame): Quantities with 'Date', 'Category', 'Province' and
        'Quantity' columns.

    Returns:
    pd.DataFrame: df1's rows with a 'Quantity' column added.
    """
    merged_data = pd.merge(df1, df2, on=['Date', 'Category', 'Province'], how='left')

    shared_sources = {
        'beras': ['Beras Medium', 'Beras Premium'],
        'minyak goreng': ['Minyak Goreng Curah', 'Minyak Goreng Kemasan Sederhana'],
    }
    source_rows = df2[df2['Category'].isin(list(shared_sources))]
    if source_rows.empty:
        return merged_data

    # One row per (Date, Province), one column per source category that exists.
    totals = (
        source_rows.groupby(['Date', 'Province', 'Category'])['Quantity']
        .sum()
        .unstack(fill_value=0)
    )
    present = [name for name in shared_sources if name in totals.columns]
    helper_columns = {name: f'_shared_{name}' for name in present}
    totals = totals[present].rename(columns=helper_columns).reset_index()
    totals.columns.name = None

    merged_data = merged_data.merge(totals, on=['Date', 'Province'], how='left')
    for name in present:
        targets = merged_data['Category'].isin(shared_sources[name])
        merged_data.loc[targets, 'Quantity'] = merged_data.loc[targets, helper_columns[name]]

    return merged_data.drop(columns=list(helper_columns.values()))

In [158]:
def one_hot_encode(df, column):
    """
    Expand one categorical column into indicator columns.

    The first level of the column is dropped, so k distinct values produce
    k - 1 indicator columns. The input frame is not modified.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the categorical column.
    column (str): Name of the column to encode.

    Returns:
    pd.DataFrame: A new DataFrame with the column replaced by its indicators.
    """
    return pd.get_dummies(data=df, columns=[column], drop_first=True)

In [177]:
def pipeline(folder_path, base_folder_path, isTrain):
    """
    Build the long-format table of prices joined with quantities.

    Parameters:
    folder_path (str): Folder of price CSV files, read with load_data.
    base_folder_path (str): Folder of the second data source. Read with
        load_data2 and zero-filled with fillna when isTrain is true; read with
        google_trend otherwise.
    isTrain (bool): Selects which loader handles base_folder_path.

    Returns:
    pd.DataFrame: Output of merge() on the melted price and quantity tables.
    Provinces are kept as a single column (no one-hot encoding is applied).
    """
    prices_wide = load_data(folder_path)
    trend_wide = fillna(load_data2(base_folder_path)) if isTrain else google_trend(base_folder_path)
    return merge(melt(prices_wide, 'Price'), melt(trend_wide, 'Quantity'))

In [183]:
# Build the training table: price CSVs from the 'train' folder joined with the
# second data source. isTrain=True makes pipeline() read folder_path2 with
# load_data2 (one sub-folder per category) and zero-fill it with fillna.
# NOTE(review): these are machine-specific absolute paths. The test cell further
# down points folder_path2 at '...\google_trend' rather than '...\Google Trend';
# confirm both folders exist and are the intended sources.
folder_path = r'C:\Users\farel\OneDrive\Documents\GitHub\Arkavidia-9\Harga Bahan Pangan\train'
folder_path2 = r'C:\Users\farel\OneDrive\Documents\GitHub\Arkavidia-9\Google Trend'
train = pipeline(folder_path, folder_path2, True)

  merged_data = pd.concat(dataframes, ignore_index=True)


In [180]:
from sklearn.model_selection import train_test_split
# Encode the date as whole seconds since the Unix epoch so the tree models can
# split on it. The guard keeps this cell idempotent: on a re-run 'Date' is
# already an integer, and pd.to_datetime would reinterpret those seconds as
# nanoseconds. Subtracting the epoch and floor-dividing by one second does not
# depend on the datetime column's storage resolution.
if not pd.api.types.is_integer_dtype(train['Date']):
    train['Date'] = (pd.to_datetime(train['Date']) - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
train['Category'] = train['Category'].astype('category')
train['Province'] = train['Province'].astype('category')

# Rows without a target cannot be used for fitting or scoring.
train = train.dropna(subset=['Price'])

# 'Quantity' is dropped together with the target, so the models below are fitted
# on the remaining columns only.
X = train.drop(columns=['Price', 'Quantity'])
y = train['Price']

# NOTE(review): this is a shuffled random split of dated rows, so the held-out
# set is interleaved in time with the training set. Confirm that this matches
# how the model will be used before trusting the scores below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
import numpy as np

def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Calculate the Mean Absolute Percentage Error (MAPE), in percent.

    Inputs are converted to float arrays and compared position by position.
    Without that conversion, two pandas Series with different indexes would be
    aligned by label and produce NaN, and plain lists would raise a TypeError.

    Parameters:
    y_true (array-like): True values. A zero entry makes its term infinite
        (or NaN when the prediction is also zero).
    y_pred (array-like): Predicted values, same length as y_true.

    Returns:
    float: The calculated MAPE. NaN terms propagate into the result.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((actual - predicted) / actual)) * 100

In [103]:
import xgboost as xgb

# Baseline: XGBoost regressor with squared-error objective. Only 10 trees are
# fitted at learning_rate 0.1, and no random_state is set.
# enable_categorical=True is passed because 'Category' and 'Province' were cast
# to the pandas 'category' dtype in the feature-preparation cell.
model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, enable_categorical=True)

model.fit(X_train, y_train)

# Predictions on the held-out 20% split, scored in the next cell.
y_pred = model.predict(X_test)

In [104]:
# Held-out MAPE (in percent) for the predictions currently stored in y_pred.
mape(y_test, y_pred)

60.451369130362146

In [17]:
from lightgbm import LGBMRegressor

# Baseline: LightGBM regressor with library defaults and a fixed seed.
model = LGBMRegressor(random_state=42)

model.fit(X_train, y_train)

# Predictions on the held-out split, scored in the next cell.
y_pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 559
[LightGBM] [Info] Number of data points in the train set: 338071, number of used features: 4
[LightGBM] [Info] Start training from score 36140.583753


In [205]:
# Held-out MAPE (in percent) for the predictions currently stored in y_pred.
mape(y_test, y_pred)

5.2722734405020555

In [None]:
#hist gradient boosting
from sklearn.experimental import enable_hist_gradient_boosting


# Baseline: scikit-learn histogram gradient boosting with library defaults and a
# fixed seed.
model = HistGradientBoostingRegressor(random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
# Last expression of the cell: the held-out MAPE (in percent) is displayed.
mape(y_test, y_pred)

5.266117540446549

In [20]:
from skopt import BayesSearchCV
from sklearn.model_selection import KFold  # Change this line

# Define your model
# Estimator whose hyper-parameters are tuned.
model = LGBMRegressor(random_state=42)

# Search space for Bayesian optimisation: (low, high) bounds per parameter,
# with the learning rate sampled on a log scale.
# NOTE(review): presumably 'subsample' only takes effect in LightGBM when
# subsample_freq is greater than 0, which is not set here - confirm.
param_space = {
    'n_estimators': (10, 100),
    'max_depth': (1, 10),
    'learning_rate': (0.01, 1.0, 'log-uniform'),
    'num_leaves': (2, 50),
    'min_child_samples': (1, 20),
    'subsample': (0.05, 1.0),
    'colsample_bytree': (0.1, 1.0),
}

# 32 candidate settings, each scored with 5-fold (unshuffled) cross-validation.
# No 'scoring' is passed, so the search ranks candidates by the default score,
# not by MAPE.
opt = BayesSearchCV(model, param_space, n_iter=32, cv=KFold(n_splits=5), n_jobs=-1, random_state=42)

# Run the search on the training split.
opt.fit(X_train, y_train)

# Predict the held-out split with the search object.
y_pred = opt.predict(X_test)

# Held-out MAPE in percent. It is stored under its own name because assigning
# it to `mape` would replace the mape() helper defined earlier, and the earlier
# evaluation cells would then fail when re-run.
mape_lgbm_search = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f'MAPE: {mape_lgbm_search:.2f}%')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001919 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 559
[LightGBM] [Info] Number of data points in the train set: 338071, number of used features: 4
[LightGBM] [Info] Start training from score 36140.583753
MAPE: 3.53%


In [21]:
# Hyper-parameters of the best candidate found by the search in the previous cell.
best_params = opt.best_params_
print("Best Parameters:", best_params)

# Optionally, create a new model with the best parameters
best_model = LGBMRegressor(**best_params, random_state=42)

# Fit the best model on the training data (if needed)
best_model.fit(X_train, y_train)

# Make predictions with the best model
y_pred_best_model = best_model.predict(X_test)

# Calculate MAPE for the best model
# (the recorded output shows the same 3.53% as the search object's predictions)
mape_best_model = np.mean(np.abs((y_test - y_pred_best_model) / y_test)) * 100
print(f'MAPE with Best Model: {mape_best_model:.2f}%')

Best Parameters: OrderedDict([('colsample_bytree', 1.0), ('learning_rate', 0.4319256222176345), ('max_depth', 10), ('min_child_samples', 20), ('n_estimators', 70), ('num_leaves', 50), ('subsample', 0.21727297425491027)])
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 559
[LightGBM] [Info] Number of data points in the train set: 338071, number of used features: 4
[LightGBM] [Info] Start training from score 36140.583753
MAPE with Best Model: 3.53%


In [76]:
from skopt import BayesSearchCV
from sklearn.model_selection import KFold
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
import numpy as np

# Estimator whose hyper-parameters are tuned.
model = HistGradientBoostingRegressor(random_state=42)

# Search space: (low, high) bounds per parameter, learning rate on a log scale.
param_space = {
    'max_iter': (100, 1000),
    'max_depth': (1, 10),
    'learning_rate': (0.01, 1.0, 'log-uniform'),
    'min_samples_leaf': (1, 20),
    'l2_regularization': (0.0, 10.0)
}

# 32 candidate settings, each scored with 5-fold (unshuffled) cross-validation.
# No 'scoring' is passed, so candidates are ranked by the default score, not MAPE.
opt = BayesSearchCV(model, param_space, n_iter=32, cv=KFold(n_splits=5), n_jobs=-1, random_state=42)
opt.fit(X_train, y_train)

y_pred = opt.predict(X_test)

# Held-out MAPE in percent. It is stored under its own name because assigning
# it to `mape` would replace the mape() helper defined earlier in the notebook.
mape_hgb_search = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f'MAPE: {mape_hgb_search:.2f}%')

best_params = opt.best_params_
print("Best Parameters:", best_params)

# Refit a standalone model with the selected hyper-parameters and score it on
# the same held-out split.
best_model = HistGradientBoostingRegressor(**best_params, random_state=42)

best_model.fit(X_train, y_train)
y_pred_best_model = best_model.predict(X_test)

mape_best_model = np.mean(np.abs((y_test - y_pred_best_model) / y_test)) * 100
print(f'MAPE with Best Model: {mape_best_model:.2f}%')

MAPE: 2.12%
Best Parameters: OrderedDict([('l2_regularization', 10.0), ('learning_rate', 0.420324924561727), ('max_depth', 7), ('max_iter', 1000), ('min_samples_leaf', 20)])
MAPE with Best Model: 2.12%


In [181]:
# Final model: histogram gradient boosting with the hyper-parameters printed as
# "Best Parameters" by the Bayesian search cell. This `model` is the one used
# for the test-set predictions later in the notebook.
model = HistGradientBoostingRegressor(l2_regularization=10.0, learning_rate=0.420324924561727,
                                      max_depth=7, max_iter=1000, min_samples_leaf=20, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Held-out MAPE in percent. It is stored under its own name because assigning
# it to `mape` would replace the mape() helper defined earlier in the notebook.
mape_final = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f'MAPE: {mape_final:}%')

MAPE: 2.099736331028919%


In [148]:
#feature importances
from sklearn.inspection import permutation_importance
# Permutation importance of the current `model` on the held-out split: each
# feature is shuffled 30 times and the resulting change in score is recorded.
# No 'scoring' is passed, so the importances are in units of the default score.
result = permutation_importance(model, X_test, y_test, n_repeats=30, random_state=42)

# Mean importance per feature, and the feature order from most to least important
importances = result.importances_mean
feature_names = X_train.columns
indices = np.argsort(importances)[::-1]

# Display feature importances
for i in range(len(importances)):
    print(f"{feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

Category: 1.9571
Province: 0.0919
Date: 0.0673


In [77]:
# Re-score the most recently fitted search object (`opt`) on the held-out split.
y_pred = opt.predict(X_test)
# Stored under its own name because assigning the score to `mape` would replace
# the mape() helper defined earlier in the notebook.
mape_search = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print(f'MAPE: {mape_search:.2f}%')

MAPE: 2.12%


In [134]:
# Build the test table. isTrain=False makes pipeline() read folder_path2 with
# google_trend (flat folder, one CSV per category) and skip the fillna step.
# NOTE(review): these are machine-specific absolute paths, and folder_path2 is
# rebound here to '...\google_trend' whereas the training cell used
# '...\Google Trend' - confirm the two sources are meant to differ.
folder_test = r'C:\Users\farel\OneDrive\Documents\GitHub\Arkavidia-9\Harga Bahan Pangan\test'
folder_path2 = r'C:\Users\farel\OneDrive\Documents\GitHub\Arkavidia-9\google_trend'
test = pipeline(folder_test, folder_path2, False)

In [135]:
# Encode the date as whole seconds since the Unix epoch, matching the training
# features. The guard keeps this cell idempotent: when 'Date' is already an
# integer, pd.to_datetime would reinterpret those seconds as nanoseconds.
if not pd.api.types.is_integer_dtype(test['Date']):
    test['Date'] = (pd.to_datetime(test['Date']) - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
# NOTE(review): the categories are derived from the test values alone. Confirm
# that the fitted model maps them by label rather than by category code.
test['Category'] = test['Category'].astype('category')
test['Province'] = test['Province'].astype('category')

In [136]:
# Feed the model exactly the columns it was fitted on, in the same order. The
# merged test table also carries 'Quantity', which the training cell removes
# from X, so dropping only 'Price' would hand the model an extra feature.
testing = test[list(X_train.columns)]
test['Price'] = model.predict(testing)

# Turn the epoch seconds back into dates to build the submission id
# '<Category>/<Province>/<Date>'.
# NOTE(review): after this line 'Date' is a datetime again, so re-run the
# feature-preparation cell above before re-running this one.
test['Date'] = pd.to_datetime(test['Date'], unit='s')
test['id'] = test['Category'].astype(str) + '/' + test['Province'].astype(str) + '/' + test['Date'].astype(str)

In [137]:
# Keep only the submission id and the predicted price.
prediction = test[['id', 'Price']]

In [138]:
# Sample submission file; it is used below for the order of ids.
# NOTE(review): machine-specific absolute path.
sample_submission = pd.read_csv(r'C:\Users\farel\OneDrive\Documents\GitHub\Arkavidia-9\Harga Bahan Pangan\sample_submission.csv')

In [139]:
# Put the predictions in the same id order as the sample submission. Ids that
# are missing from `prediction` come out of reindex as rows with NaN price.
# NOTE(review): the sample file's value column is displayed as lowercase 'price',
# while this frame keeps 'Price' - confirm which header the grader expects.
submission = prediction.set_index('id').reindex(sample_submission['id']).reset_index()

In [140]:
# Write the submission to the current working directory, without the row index.
submission.to_csv('submission8.csv', index=False)

In [141]:
# Display the sample submission to check its id format and column names.
sample_submission

Unnamed: 0,id,price
0,Bawang Merah/Aceh/2024-10-01,0
1,Bawang Merah/Aceh/2024-10-02,0
2,Bawang Merah/Aceh/2024-10-03,0
3,Bawang Merah/Aceh/2024-10-04,0
4,Bawang Merah/Aceh/2024-10-05,0
...,...,...
40659,Tepung Terigu (Curah)/Sumatera Utara/2024-12-27,0
40660,Tepung Terigu (Curah)/Sumatera Utara/2024-12-28,0
40661,Tepung Terigu (Curah)/Sumatera Utara/2024-12-29,0
40662,Tepung Terigu (Curah)/Sumatera Utara/2024-12-30,0
