In [257]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [260]:
def load_data(folder_path):
    dataframes = []
    for filename in os.listdir(folder_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            category = os.path.splitext(filename)[0]
            df['Category'] = category
            dataframes.append(df)
    if dataframes:
        merged_data = pd.concat(dataframes, ignore_index=True)
    else:
        print("No CSV files found in the specified folder.")
        return pd.DataFrame()

    return merged_data

In [284]:
def load_data2(base_folder_path):
    dataframes = []
    
    for category in os.listdir(base_folder_path):
        category_path = os.path.join(base_folder_path, category)
        
        if os.path.isdir(category_path):
            for filename in os.listdir(category_path):
                if filename.endswith('.csv'):
                    file_path = os.path.join(category_path, filename)
                    df = pd.read_csv(file_path)
                    df['Category'] = category
                    province = os.path.splitext(filename)[0]
                    df['Province'] = province
                    price_columns = df.columns.difference(['Date', 'Category', 'Province'])
                    if not price_columns.empty:
                        df['Price'] = df[price_columns[0]]
                    else:
                        print(f"No price column found in {filename}. Skipping this file.")
                        continue
                    
                    dataframes.append(df)
    
    if dataframes:
        merged_data = pd.concat(dataframes, ignore_index=True)
    else:
        print("No CSV files found in the specified folder.")
        return pd.DataFrame()
    merged_data2 = merged_data.pivot_table(index=['Date', 'Category'], columns='Province', values='Price', aggfunc='first')

    merged_data2.reset_index(inplace=True)
    merged_data2['Category'] = merged_data2['Category'].replace({
    'bawang merah': 'Bawang Merah',
    'bawang putih': 'Bawang Putih Bonggol',
    'cabai merah': 'Cabai Merah Keriting',
    'cabai rawit': 'Cabai Rawit Merah',
    'daging ayam': 'Daging Ayam Ras',
    'daging sapi': 'Daging Sapi Murni',
    'gula': 'Gula Konsumsi',
    'telur ayam': 'Telur Ayam Ras',
    'tepung terigu': 'Tepung Terigu (Curah)',
})
    return merged_data2


In [262]:
def melt(df, value_name):
    melted = df.melt(id_vars=['Date', 'Category'], var_name='Province', value_name=value_name)
    return melted

In [286]:
def merge(df1, df2):
    merged_data = pd.merge(df1, df2, on=['Date', 'Category', 'Province'], how='left')
    for date in df2['Date'].unique():
        beras_quantity = df2.loc[df2['Category'] == 'beras', 'Quantity'].loc[df2['Date'] == date].sum()
    for category in ['Beras Medium', 'Beras Premium']:
        merged_data.loc[(merged_data['Category'] == category) & (merged_data['Date'] == date), 'Quantity'] += beras_quantity
    return merged_data

In [288]:
def pipeline(folder_path, base_folder_path):
    data = load_data(folder_path)
    data2 = load_data2(base_folder_path)
    data_melted = melt(data, 'Price')
    data2_melted = melt(data2, 'Quantity')
    data_merged = merge(data_melted, data2_melted)
    return data_merged

In [None]:
folder_path = r'C:\Users\farel\OneDrive\Documents\Arkavidia 9\Harga Bahan Pangan\train'
folder_path2 = r'C:\Users\farel\OneDrive\Documents\Arkavidia 9\Google Trend'
train = pipeline(folder_path, folder_path2)

In [290]:
from sklearn.model_selection import train_test_split
train['Date'] = pd.to_datetime(train['Date'])
train['Date'] =train['Date'].astype(int) // 10**9
train['Category'] = train['Category'].astype('category')
train['Province'] = train['Province'].astype('category')
train = train.dropna(subset=['Price'])
X = train.drop(columns=['Price'])
y = train['Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [291]:
import numpy as np

def mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Calculate the Mean Absolute Percentage Error (MAPE).

    Parameters:
    y_true (array-like): True values.
    y_pred (array-like): Predicted values.

    Returns:
    float: The calculated MAPE.
    """
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [292]:
import xgboost as xgb

model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10, enable_categorical=True)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [293]:
mape(y_test, y_pred)

np.float64(61.01411985040096)

In [279]:
y_pred

array([36577.33 , 30581.002, 39282.984, ..., 26488.398, 26657.238,
       29378.229], shape=(84518,), dtype=float32)

In [294]:
from lightgbm import LGBMRegressor

model = LGBMRegressor()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001550 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 558
[LightGBM] [Info] Number of data points in the train set: 338071, number of used features: 4
[LightGBM] [Info] Start training from score 36140.583753


In [295]:
mape(y_test, y_pred)

np.float64(5.309832040881665)

In [296]:
folder_test = r'C:\Users\farel\OneDrive\Documents\Arkavidia 9\Harga Bahan Pangan\test'
test = pipeline(folder_test, folder_path2)

  merged_data = pd.concat(dataframes, ignore_index=True)


In [305]:
import catboost as cb

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject