<a href="https://colab.research.google.com/github/daniraymundo/traffic-volume-regression/blob/main/regression_model_traffic_volume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installing PyCaret, importing the required libraries, and loading the data

In [None]:
# Install the PyCaret library
!pip install pycaret --quiet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fetch the 'traffic' dataset through PyCaret's dataset helper and preview the first five rows
from pycaret.datasets import get_data

dataset = get_data('traffic')
dataset.head(5)

# Data exploration and pre-processing

In [None]:
# Summarize the columns: dtypes and non-null counts
dataset.info()

In [None]:
# Number of rows and columns before any cleaning
dataset.shape

In [None]:
# Count rows that are exact duplicates of an earlier row
dataset.duplicated().sum()

In [None]:
# Drop exact duplicate rows (the first occurrence is kept) and confirm the new shape
dataset = dataset.drop_duplicates(keep='first')
dataset.shape

In [None]:
# Count missing values in each column
dataset.isna().sum()

In [None]:
# List the distinct values of the holiday column to see how non-holidays are represented
dataset['holiday'].unique()

In [None]:
# Encode the holiday column as a binary flag: 1 when a holiday name is present, else 0.
# "No holiday" is treated as 0 whether it arrives as a missing value or as the
# literal string 'None' (which of the two appears depends on how the CSV was parsed).
# The dtype guard keeps the cell idempotent: once the column holds 0/1 integers,
# re-running the conversion would mark every row as 1 because all values are non-null.
holiday_raw = dataset['holiday']
if not pd.api.types.is_integer_dtype(holiday_raw):
    dataset['holiday'] = (holiday_raw.notna() & holiday_raw.ne('None')).astype(int)

In [None]:
# Verify the encoding: the holiday column should now contain only 0 and 1; show how many rows of each
dataset['holiday'].value_counts()

In [None]:
# Give the two indicator columns consistent `is_*` names
column_renames = {'holiday': 'is_holiday', 'Rush Hour': 'is_rush_hour'}
dataset = dataset.rename(columns=column_renames)

# Preview the first five rows to confirm the new column names
dataset.head()


In [None]:
# Summary statistics (count, mean, spread, quartiles) of the numeric columns
dataset.describe()

In [None]:
# Share of rows at each rain_1h value (normalize=True returns proportions rather than counts)
dataset['rain_1h'].value_counts(normalize=True)

In [None]:
# Share of rows at each snow_1h value (normalize=True returns proportions rather than counts)
dataset['snow_1h'].value_counts(normalize=True)

In [None]:
# Collapse the hourly rain and snow amounts into 0/1 indicators (1 when the amount is > 0).
# This simplifies the model, since > 90% of the values are 0.
for precip_col in ('rain_1h', 'snow_1h'):
    dataset[precip_col] = dataset[precip_col].gt(0).astype(int)

In [None]:
# Collapse the detailed weather conditions into three broader categories

# Each broad category lists the 'weather_main' values that belong to it
weather_groups = {
    'Normal': ['Clear', 'Clouds'],
    'Precipitation': ['Rain', 'Drizzle', 'Thunderstorm', 'Snow', 'Squall'],
    'Low_Visibility': ['Mist', 'Haze', 'Fog', 'Smoke'],
}

# Invert the grouping into a condition -> category lookup
weather_mapping = {
    condition: category
    for category, conditions in weather_groups.items()
    for condition in conditions
}

# Derive the new column; any condition missing from the lookup maps to NaN
dataset['weather_category'] = dataset['weather_main'].map(weather_mapping)

# Print the last 50 rows side by side to spot-check the mapping
print(dataset[['weather_main', 'weather_category']].tail(50))

In [None]:
# Row counts for the 50 smallest distinct traffic_volume values (sorted by value, not by count)
dataset['traffic_volume'].value_counts().sort_index().head(50)

In [None]:
# Box plot of the target to inspect its spread and any outliers
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x=dataset['traffic_volume'], ax=ax)
ax.set_title('Boxplot of Traffic Volume')
plt.show()


In [None]:
# Work on a copy so that exploratory columns added to data1 do not end up in `dataset`
data1=dataset.copy()

In [None]:
# Number of rows whose traffic_volume is at most 1000
len(data1[data1['traffic_volume'] <= 1000])


In [None]:
# Define bin edges and labels for traffic volume.
# The top edge is infinity so the '5001+' bin is genuinely open-ended: with a
# finite top edge, any value above it would be assigned NaN by pd.cut and
# silently drop out of the counts and percentages below.
bins = [0, 1000, 3000, 5000, float('inf')]
labels = ['0–1000 (Low)', '1001–3000 (Moderate)', '3001–5000 (Busy)', '5001+ (Very Busy)']

# Bin the traffic volume; intervals are right-closed and the first one also includes 0
data1['traffic_volume_bin'] = pd.cut(
    data1['traffic_volume'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Rows per bin, in bin order, as absolute counts and as percentages
distribution = data1['traffic_volume_bin'].value_counts().sort_index()
distribution1 = data1['traffic_volume_bin'].value_counts(normalize=True).sort_index() * 100

# Combine both views into one summary table
distribution_df = pd.DataFrame({
    'Traffic Volume Range': distribution.index,
    'Count' : distribution.values,
    'Percentage of Hours (%)': distribution1.values
})

distribution_df

In [None]:
# Bar chart of how many hourly observations fall into each traffic-volume bin
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='traffic_volume_bin', data=data1, ax=ax)
ax.set_title('Distribution of Traffic Volume Categories')
ax.set_xlabel('Traffic Volume Category')
ax.set_ylabel('Number of Hours')
# Tilt the long category labels so they do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()


In [None]:
# Keep only the observations whose traffic volume is above 1000
above_threshold = dataset['traffic_volume'] > 1000
filtered_data = dataset.loc[above_threshold].copy()

# Report both shapes to show how many rows the filter removed
print(f"Original data size: {dataset.shape}")
print(f"Filtered data size: {filtered_data.shape}")

In [None]:
# Column dtypes and non-null counts of the filtered dataset
filtered_data.info()

In [None]:
# Summary statistics of the numeric columns after removing the low-volume rows
filtered_data.describe()

In [None]:
# Side-by-side histograms of the target before and after filtering,
# to visualize how the filter changes the distribution
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = [
    (dataset, 'Traffic Volume Histogram (Original Dataset)'),
    (filtered_data, 'Traffic Volume Histogram (Filtered Dataset)'),
]

for ax, (frame, panel_title) in zip(axes, panels):
    sns.histplot(frame['traffic_volume'], bins=20, kde=True, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel('Traffic Volume')
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()


# Holding out 5% of the data as unseen data

In [None]:
# Hold out 5% of the filtered rows as unseen data; the remaining 95% is kept for modelling.
# The seed makes the split reproducible.
sampled = filtered_data.sample(frac=0.95, random_state=42)

# Rows that were not sampled form the unseen set; both frames get a fresh 0..n-1 index
data_unseen = filtered_data.drop(sampled.index).reset_index(drop=True)
data = sampled.reset_index(drop=True)

# Setting up the environment in PyCaret

In [None]:
# Initialize the PyCaret regression experiment.
# NOTE: setup receives `data` (the 95% sample), not `filtered_data`.
# `filtered_data` still contains the rows held out in `data_unseen`, so passing it
# here would let the model see the "unseen" rows and inflate the final evaluation.
from pycaret.regression import *
setup(data = data,
      target = 'traffic_volume',
      # Binary 0/1 indicator columns are declared as categorical
      categorical_features=['is_holiday', 'is_rush_hour', 'rain_1h', 'snow_1h'],
      numeric_features=['temp', 'clouds_all'],
      # Explicit level order for the ordinal encoding of the weather category
      ordinal_features={'weather_category' : ['Normal', 'Low_Visibility', 'Precipitation']},
      # The raw weather column is superseded by weather_category
      ignore_features=['weather_main'],
      remove_outliers=True,
      remove_multicollinearity=True,
      normalize=True,
      normalize_method='minmax',
      # Fixed seed so the experiment is reproducible
      session_id = 123)

In [None]:
# Display the variables created during setup
# NOTE(review): called without an argument — presumably returns the names accepted by get_config(name); confirm against the PyCaret docs
get_config()

In [None]:
# Preview the first 10 rows of the 'dataset_transformed' config entry (the data as transformed by setup)
get_config('dataset_transformed').head(10)

In [None]:
# Column dtypes and non-null counts of the transformed dataset
get_config('dataset_transformed').info()

In [None]:
# Summary statistics of the transformed dataset
get_config('dataset_transformed').describe()

# Evaluating different models

In [None]:
# Compare the candidate models, with the results sorted by MAPE.
# NOTE(review): best_model is not referenced again in the visible notebook; the models used later are created explicitly by id.
best_model = compare_models(sort='MAPE')

# Creating the model

In [None]:
# Create the model with id 'lightgbm' (chosen by the author for best overall metrics in the comparison)
lightgbm = create_model('lightgbm')

In [None]:
# Create the model with id 'rf' (chosen by the author for lowest MAPE in the comparison)
rf = create_model('rf')

# Tuning the model

In [None]:
# Tune the lightgbm model's hyperparameters, optimizing for MAPE
tuned_lightgbm = tune_model(lightgbm,
                            optimize='MAPE')

In [None]:
# Tune the rf model's hyperparameters, optimizing for MAPE, with fold=5.
# NOTE(review): tuned_rf is not used later in the visible notebook — the blend and the feature plot use the untuned `rf`.
tuned_rf=tune_model(rf,
                    optimize='MAPE',
                    fold=5)

# Blending the top 2 models

In [None]:
# Blend the tuned lightgbm model with the rf model, optimizing for MAPE.
# NOTE(review): the untuned `rf` is blended here although `tuned_rf` exists — confirm this is intentional.
blended_model = blend_models(estimator_list=[tuned_lightgbm, rf], optimize='MAPE')

# Evaluation and Visualization

In [None]:
# Evaluate the blended model with PyCaret's evaluate_model
evaluate_model(blended_model)

In [None]:
# Plot feature importance of the tuned lightgbm model
plot_model(tuned_lightgbm, plot='feature')

In [None]:
# Plot feature importance of the (untuned) rf model, the same rf estimator used in the blend
plot_model(rf, plot='feature')

# Making predictions

In [None]:
# Finalize the blended model, then score the held-out rows in data_unseen
# NOTE(review): finalize_model presumably refits on all data given to setup, including its internal hold-out — confirm against the PyCaret docs
final_model = finalize_model(blended_model) # Retrains on entire training data
predictions= predict_model(final_model, data=data_unseen)

In [None]:
# Preview the first 10 rows of the unseen data together with the model's predictions
predictions.head(10)

In [None]:
# Scatter of actual traffic volume (x) against the model's prediction (y) on the unseen data
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(predictions['traffic_volume'], predictions['prediction_label'])
ax.set_xlabel('Traffic Volume')
ax.set_ylabel('Predicted Traffic Volume')
ax.set_title('Traffic Volume vs. Prediction')
plt.show()
