In [4]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import(
    mean_absolute_error as MAE,
    mean_squared_error as MSE,
    mean_absolute_percentage_error as MAPE,

)

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.compose import ColumnTransformer


import joblib
import warnings
warnings.filterwarnings('ignore')

In [5]:
# import data
data_name_june = "yellow_tripdata_2025-06.parquet"
raw_data_url_june = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-06.parquet"

# Download file terlebih dahulu
response = requests.get(raw_data_url_june)
with open(f"../data/{data_name_june}", "wb") as f:
    f.write(response.content)

# Baca file parquet dari lokal
df_june = pd.read_parquet(f"../data/{data_name_june}")

trip_data_06 = df_june



In [6]:
# Feature and target selection
features = trip_data_06[['passenger_count', 'trip_distance', 'PULocationID', 'DOLocationID']]
target = trip_data_06['total_amount']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features,target,
                                         test_size=0.2,
                                         random_state=42)

# rows of features
print(f"Training data size: {X_train.shape[0]} samples")
print(f"Testing data size: {X_test.shape[0]} samples")


Training data size: 3458368 samples
Testing data size: 864592 samples


In [7]:

# Membuat pipeline preprocessing
preprocessing_pipeline = Pipeline([
    # Step 1: Handling missing values
    ('imputer', SimpleImputer(strategy='median')), 
    
    # Step 2: Removing outliers effect and scaling
    ('scaler', RobustScaler()),
])

In [8]:
from sklearn.model_selection import RandomizedSearchCV

# pipeline model
pipeline = Pipeline([
    ('prep', preprocessing_pipeline),
    ('algo', LinearRegression())
])

# Parameter grid 
param_grid = {
    # prep
    'prep__scaler__quantile_range': [(25.0, 75.0), (10.0, 90.0), (5.0, 95.0)],
    'prep__imputer__strategy': ['mean', 'median', 'most_frequent'],

    # algo
    'algo__fit_intercept': [True, False],
    'algo__positive': [True, False]
}

# RandomizedSearchCV (tuning hyperparameter model))
model = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=10,  # Number of parameter settings sampled
    cv=5,       # Number of cross-validation folds
    scoring='neg_mean_squared_error',
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Fit RandomizedSearchCV
model.fit(X_train, y_train)

# Print best parameters and score
print("Best parameters:", model.best_params_)
print("Best MSE score:", -model.best_score_)

Best parameters: {'prep__scaler__quantile_range': (10.0, 90.0), 'prep__imputer__strategy': 'median', 'algo__positive': False, 'algo__fit_intercept': True}
Best MSE score: 31199.922590398568


In [9]:
# Prediksi pada data test
y_pred = model.predict(X_test)

# Evaluasi model regresi
mae = MAE(y_test, y_pred)
mse = MSE(y_test, y_pred)
rmse = np.sqrt(mse)
mape = MAPE(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.4f}")
print(f"R^2 Score: {r2:.4f}")

Mean Absolute Error (MAE): 14.55
Mean Squared Error (MSE): 569.23
Root Mean Squared Error (RMSE): 23.86
Mean Absolute Percentage Error (MAPE): 17962799434417.8203
R^2 Score: -569.2298
