In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import joblib
import time
import gc  # Garbage collector for memory management

In [12]:
# Record start time
start_time = time.time()


In [13]:
# Memory usage monitoring function
def check_memory_usage(step_name=""):
    import psutil
    process = psutil.Process()
    memory_info = process.memory_info()
    memory_mb = memory_info.rss / 1024 / 1024
    print(f"Memory usage ({step_name}): {memory_mb:.2f} MB")

print("Loading data...")
try:
    # Load data in chunks if file is large
    df = pd.read_csv("/content/drive/MyDrive/proyectoDS/data/all_waiting_times.csv")
    metadata = pd.read_csv("/content/drive/MyDrive/proyectoDS/data/overview data/metadata.csv")
    check_memory_usage("after loading")
except Exception as e:
    print(f"Error loading data: {e}")
    # Alternative approach with chunking if needed
    # df = pd.read_csv("all_waiting_times.csv", chunksize=100000)
    # df = pd.concat([chunk for chunk in df])

Loading data...
Memory usage (after loading): 3634.49 MB


In [14]:
# Feature Engineering - focused and memory-efficient
print("Performing feature engineering...")
df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")

# Basic time features - essential ones only
df["hour"] = df["datetime"].dt.hour
df["minute"] = df["datetime"].dt.minute
df["weekday"] = df["datetime"].dt.day_name()
df["month"] = df["datetime"].dt.month
df["is_weekend"] = df["datetime"].dt.dayofweek >= 5
df["date"] = df["datetime"].dt.date  # Create date column for merging

# Most important cyclical features only
df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

Performing feature engineering...


In [15]:
# Free memory - but don't delete datetime yet
gc.collect()
check_memory_usage("after feature engineering")

Memory usage (after feature engineering): 5100.02 MB


In [16]:
# Merge only essential metadata
metadata["DATE"] = pd.to_datetime(metadata["DATE"]).dt.date
df = df.merge(
    metadata[["DATE", "HOLIDAYM"]], left_on="date", right_on="DATE", how="left"
)
df["HOLIDAYM"] = df["HOLIDAYM"].fillna(0)

# Free memory from unused columns
del df["DATE"]
del df["date"]
del df["datetime"]  # Now it's safe to delete datetime
gc.collect()
check_memory_usage("after metadata merge")

# Handle attraction popularity efficiently
attraction_avg_wait = df.groupby("attraction")["SPOSTMIN"].mean().reset_index()
attraction_avg_wait.columns = ["attraction", "avg_historical_wait"]
df = df.merge(attraction_avg_wait, on="attraction", how="left")

Memory usage (after metadata merge): 4294.12 MB


In [17]:
# Filter invalid data
print("Filtering data...")
df = df[
    (df["SPOSTMIN"].notna()) &
    (df["SPOSTMIN"] >= 0) &
    (df["SPOSTMIN"] < 300)
]
check_memory_usage("after filtering")

Filtering data...
Memory usage (after filtering): 4722.98 MB


In [18]:
# One-hot encode categorical features - use sparse matrices for efficiency
print("Encoding categorical features...")
df_encoded = pd.get_dummies(df, columns=["weekday", "attraction"], sparse=True)

# Prepare features and target
print("Preparing features...")
feature_cols = [
    "hour", "minute", "month", "HOLIDAYM", "is_weekend",
    "hour_sin", "hour_cos", "avg_historical_wait"
]

Encoding categorical features...
Preparing features...


In [19]:
# Add encoded columns
encoded_cols = [col for col in df_encoded.columns if col.startswith(("weekday_", "attraction_"))]
all_feature_cols = feature_cols + encoded_cols

# Split data
X = df_encoded[all_feature_cols]
y = df_encoded["SPOSTMIN"]

# Free original dataframe memory
del df
del df_encoded
gc.collect()
check_memory_usage("after preparing features")

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Memory usage (after preparing features): 4983.21 MB


In [20]:
# Define model with efficient parameters
print("Training model...")
model = GradientBoostingRegressor(
    n_estimators=100,  # Fewer trees for memory efficiency
    learning_rate=0.1,
    max_depth=5,       # Moderate depth
    subsample=0.8,     # Use 80% of samples for each tree (reduces memory)
    max_features=0.8,  # Use 80% of features for each tree (reduces memory)
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    verbose=1          # Show progress
)

Training model...


In [21]:
# Scale features for better model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train model
model.fit(X_train_scaled, y_train)
check_memory_usage("after model training")

# Evaluate model
print("Evaluating model...")
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R2 Score: {r2:.4f}")

# Feature importance - safely
try:
    if hasattr(model, "feature_importances_"):
        feature_names = list(X.columns)
        importances = model.feature_importances_

        # Ensure lengths match
        if len(importances) == len(feature_names):
            indices = np.argsort(importances)[::-1]

            print("\nTop 10 Important Features:")
            for i in range(min(10, len(indices))):
                print(f"{feature_names[indices[i]]}: {importances[indices[i]]:.4f}")
except Exception as e:
    print(f"Error getting feature importance: {e}")
check_memory_usage("after evaluation")

  if np.may_share_memory(array, array_orig):
  if np.may_share_memory(array, array_orig):


      Iter       Train Loss      OOB Improve   Remaining Time 
         1         833.9289          77.6198           53.63m
         2         769.3780          60.9297           53.67m
         3         717.6540          55.3624           52.31m
         4         674.0209          41.5545           51.49m
         5         641.6699          35.6092           51.71m
         6         610.4868          29.6070           50.97m
         7         580.7703          28.9713           50.10m
         8         557.1795          24.4875           49.29m
         9         536.5068          18.6482           48.96m
        10         518.3351          21.4314           48.31m
        20         429.3569           4.0485           42.75m
        30         393.2911           2.2183           38.28m
        40         376.5435           1.8852           33.50m
        50         365.0206          -0.2102           28.20m
        60         357.7319           0.7215           22.71m
       

In [22]:
# Visualize without keeping too many points in memory
plt.figure(figsize=(10, 8))
# Plot smaller samples if dataset is large
if len(y_test) > 5000:
    sample_indices = np.random.choice(len(y_test), 5000, replace=False)
    plt.scatter(y_test.iloc[sample_indices], y_pred[sample_indices], alpha=0.3)
else:
    plt.scatter(y_test, y_pred, alpha=0.3)
plt.xlabel("Actual SPOSTMIN")
plt.ylabel("Predicted SPOSTMIN")
plt.title("Predicted vs Actual Posted Wait Times - Memory-Efficient Model")
plt.plot([0, 300], [0, 300], color="red", linestyle="--")
plt.savefig("memory_efficient_model_predictions.png")
plt.close()

In [23]:
# Error Distribution - with sampling for large datasets
errors = y_pred - y_test
plt.figure(figsize=(10, 6))
if len(errors) > 10000:
    sampled_errors = np.random.choice(errors, 10000, replace=False)
    plt.hist(sampled_errors, bins=50)
else:
    plt.hist(errors, bins=50)
plt.xlabel("Prediction Error")
plt.ylabel("Frequency")
plt.title("Error Distribution")
plt.axvline(x=0, color="r", linestyle="-")
plt.savefig("memory_efficient_error_distribution.png")
plt.close()
check_memory_usage("after plotting")

Memory usage (after plotting): 8203.17 MB


In [24]:
# Save the model and scaler
print("Saving model...")
joblib.dump(model, "memory_efficient_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(list(X.columns), "model_features.pkl")

Saving model...


['model_features.pkl']

In [25]:
# Calculate and print execution time
end_time = time.time()
execution_time = end_time - start_time
print(f"\nExecution time: {execution_time:.2f} seconds ({execution_time/60:.2f} minutes)")
check_memory_usage("end of script")


Execution time: 4074.45 seconds (67.91 minutes)
Memory usage (end of script): 8203.17 MB
