# Distribution Fitting Analysis
## Fitting Statistical Distributions to Interarrival and Service Time Data

**Author:** fergmlx  
**Date:** 2025-12-09  
**Project:** Queue Theory Analysis

### Objectives:
1. Fit exponential, gamma, lognormal, and Weibull distributions to the data
2. Compare models using AIC and BIC criteria
3. Perform Kolmogorov-Smirnov goodness-of-fit tests
4. Perform Chi-square goodness-of-fit tests
5. Create visual comparisons with PDF overlays
6. Generate Q-Q plots for distribution validation

In [None]:
# Import required libraries
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import kstest, chisquare, chi2
warnings.filterwarnings('ignore')

# Plot appearance: seaborn grid theme plus default figure size and font size
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

## 1. Load and Prepare Data

In [None]:
# Read the processed queue dataset
# (edit the path below to match where your data lives)
df = pd.read_csv('../data/processed/queue_data.csv')

# Quick look: dimensions, leading rows, and summary statistics
print("Data shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")
print("\nData summary:", df.describe(), sep="\n")

In [None]:
# Pull the two samples out of the DataFrame as NumPy arrays, dropping missing values
# (edit the column names below to match your dataset)
interarrival_times = df['interarrival_time'].dropna().values
service_times = df['service_time'].dropna().values

# Report sample sizes first, then mean / standard deviation for each sample
for label, sample in (("Interarrival times", interarrival_times),
                      ("Service times", service_times)):
    print(f"{label}: {len(sample)} observations")
print()
for label, sample in (("Interarrival times", interarrival_times),
                      ("Service times", service_times)):
    print(f"{label} - Mean: {np.mean(sample):.4f}, Std: {np.std(sample):.4f}")

## 2. Define Distribution Fitting Functions

In [None]:
def fit_distributions(data, data_name="Data"):
    """
    Fit the four candidate families (exponential, gamma, lognormal, Weibull).

    Every fit is performed with the location parameter fixed at zero
    (``floc=0``). ``data_name`` is accepted but not used by this function.

    Returns a dict keyed by 'exponential', 'gamma', 'lognormal' and 'weibull'
    (in that order). Each value is a dict with:
      'params'      - tuple returned by the family's ``fit``
      'dist'        - frozen distribution built from those parameters
      'name'        - display name of the family
      'param_names' - labels matching the entries of 'params'
    """
    three_param_labels = ['shape', 'loc', 'scale']

    # (result key, display name, scipy family, labels for the fitted tuple)
    candidates = (
        ('exponential', 'Exponential', stats.expon, ['loc', 'scale']),
        ('gamma', 'Gamma', stats.gamma, three_param_labels),
        ('lognormal', 'Lognormal', stats.lognorm, three_param_labels),
        ('weibull', 'Weibull', stats.weibull_min, three_param_labels),
    )

    results = {}
    for key, display_name, family, labels in candidates:
        fitted = family.fit(data, floc=0)
        results[key] = {
            'params': fitted,
            'dist': family(*fitted),
            'name': display_name,
            # copy so each entry owns its own list
            'param_names': list(labels),
        }

    return results

In [None]:
def calculate_aic_bic(data, dist, params, num_params=None):
    """
    Calculate AIC and BIC for a fitted distribution.

    Parameters
    ----------
    data : array-like
        Observations the distribution was fitted to.
    dist : frozen scipy.stats distribution
        Must provide ``logpdf``.
    params : sequence
        Fitted parameter tuple. Its length is used as the parameter count
        ``k`` unless ``num_params`` is given.
    num_params : int, optional
        Number of *free* parameters. Pass this when some entries of
        ``params`` were held fixed during fitting (e.g. ``floc=0``), since
        fixed values should not be penalised. Defaults to ``len(params)``.

    Returns
    -------
    (aic, bic, log_likelihood)
        AIC = 2k - 2 ln(L), BIC = k ln(n) - 2 ln(L).

    Notes
    -----
    The log-likelihood is summed from ``dist.logpdf`` directly. Adding a
    small constant to the density before taking the log would bias the
    result and cap each point's contribution near ln(1e-10), hiding poor
    fit in the tails. An observation with zero density therefore yields a
    log-likelihood of ``-inf`` (AIC/BIC of ``inf``).
    """
    n = len(data)
    k = len(params) if num_params is None else num_params

    # Log-likelihood, evaluated in log space for accuracy in the tails
    log_likelihood = np.sum(dist.logpdf(data))

    # AIC = 2k - 2ln(L)
    aic = 2 * k - 2 * log_likelihood

    # BIC = k*ln(n) - 2ln(L)
    bic = k * np.log(n) - 2 * log_likelihood

    return aic, bic, log_likelihood

In [None]:
def perform_ks_test(data, dist):
    """
    Run a one-sample Kolmogorov-Smirnov test of ``data`` against ``dist``.

    The sample is compared with the CDF of the given frozen distribution.
    Returns ``(ks_statistic, p_value)``.
    """
    outcome = stats.kstest(data, dist.cdf)
    return outcome[0], outcome[1]

In [None]:
def perform_chi_square_test(data, dist, num_bins=10, min_expected=5,
                            num_estimated_params=None):
    """
    Perform a Chi-square goodness-of-fit test of ``data`` against ``dist``.

    Parameters
    ----------
    data : array-like
        Observed sample.
    dist : frozen scipy.stats distribution
        Hypothesised distribution; must provide ``cdf`` and ``args``.
    num_bins : int
        Number of equal-width histogram bins spanning the data range
        (before merging).
    min_expected : float
        Adjacent bins are merged until each merged category has at least
        this expected count.
    num_estimated_params : int, optional
        Number of parameters estimated from the data, subtracted from the
        degrees of freedom. Defaults to ``len(dist.args)``; pass a smaller
        value when some of those arguments were held fixed during fitting.

    Returns
    -------
    (chi2_statistic, p_value, degrees_of_freedom)
        ``p_value`` is NaN when the degrees of freedom are not positive.
        Empty input returns ``(nan, nan, 0)``.
    """
    data = np.asarray(data, dtype=float)
    n = data.size
    if n == 0:
        return np.nan, np.nan, 0

    # Create bins
    observed, bin_edges = np.histogram(data, bins=num_bins)

    # Expected counts per bin. The outermost bins are opened to the full
    # tails of the distribution so that the expected counts sum to n, like
    # the observed counts do.
    cdf_at_edges = np.asarray(dist.cdf(bin_edges), dtype=float)
    cdf_at_edges[0] = 0.0
    cdf_at_edges[-1] = 1.0
    expected = np.diff(cdf_at_edges) * n

    # Merge *adjacent* bins until each category reaches min_expected.
    # (Pooling every sparse bin into one category and every other bin into a
    # second would leave only two categories and no usable degrees of freedom.)
    merged_observed = []
    merged_expected = []
    pending_observed = 0.0
    pending_expected = 0.0
    for obs_count, exp_count in zip(observed, expected):
        pending_observed += obs_count
        pending_expected += exp_count
        if pending_expected >= min_expected:
            merged_observed.append(pending_observed)
            merged_expected.append(pending_expected)
            pending_observed = 0.0
            pending_expected = 0.0

    # Whatever is left at the right edge is too small to stand alone:
    # fold it into the last complete category (or keep it if it is the only one).
    if pending_observed > 0 or pending_expected > 0:
        if merged_expected:
            merged_observed[-1] += pending_observed
            merged_expected[-1] += pending_expected
        else:
            merged_observed.append(pending_observed)
            merged_expected.append(pending_expected)

    observed_freq = np.array(merged_observed)
    expected_freq = np.array(merged_expected)

    # Perform chi-square test
    chi2_statistic = float(np.sum((observed_freq - expected_freq) ** 2 / expected_freq))

    if num_estimated_params is None:
        num_estimated_params = len(dist.args)
    degrees_of_freedom = len(observed_freq) - 1 - num_estimated_params

    if degrees_of_freedom > 0:
        # Survival function avoids the precision loss of 1 - cdf for tiny p-values
        p_value = chi2.sf(chi2_statistic, degrees_of_freedom)
    else:
        p_value = np.nan

    return chi2_statistic, p_value, degrees_of_freedom

## 3. Fit Distributions to Interarrival Times

In [None]:
# Run all candidate fits on the interarrival sample
interarrival_fits = fit_distributions(interarrival_times, "Interarrival Times")

# Report each family's fitted parameters under a banner
print("=" * 80, "INTERARRIVAL TIMES - FITTED PARAMETERS", "=" * 80, sep="\n")

for fit_info in interarrival_fits.values():
    print(f"\n{fit_info['name']} Distribution:")
    labelled_params = zip(fit_info['param_names'], fit_info['params'])
    for param_name, param_value in labelled_params:
        print(f"  {param_name}: {param_value:.6f}")

In [None]:
# Rank the fitted models for interarrival times by information criteria
print("\n" + "=" * 80, "INTERARRIVAL TIMES - MODEL COMPARISON (AIC/BIC)", "=" * 80, sep="\n")

interarrival_comparison = []
for fit_info in interarrival_fits.values():
    aic, bic, log_lik = calculate_aic_bic(
        interarrival_times, fit_info['dist'], fit_info['params']
    )
    row = {
        'Distribution': fit_info['name'],
        'Log-Likelihood': log_lik,
        'AIC': aic,
        'BIC': bic,
    }
    interarrival_comparison.append(row)

# Sort ascending by AIC so the first row is the preferred model
interarrival_comparison_df = pd.DataFrame(interarrival_comparison).sort_values('AIC')
print("\n", interarrival_comparison_df.to_string(index=False))
print("\nBest model (lowest AIC):", interarrival_comparison_df.iloc[0]['Distribution'])

In [None]:
# Perform goodness-of-fit tests for interarrival times
print("\n" + "=" * 80)
print("INTERARRIVAL TIMES - GOODNESS-OF-FIT TESTS")
print("=" * 80)

# NOTE: each distribution is tested against the same sample its parameters
# were fitted to, so treat the reported p-values as approximate (they tend
# to be optimistic).
interarrival_tests = []
for dist_name, fit_info in interarrival_fits.items():
    # Kolmogorov-Smirnov test
    ks_stat, ks_p = perform_ks_test(interarrival_times, fit_info['dist'])

    # Chi-square test. The third return value is the degrees of freedom; it
    # must not be bound to `df`, which holds the loaded DataFrame.
    chi2_stat, chi2_p, chi2_dof = perform_chi_square_test(interarrival_times, fit_info['dist'])

    interarrival_tests.append({
        'Distribution': fit_info['name'],
        'KS Statistic': ks_stat,
        'KS p-value': ks_p,
        'Chi-square': chi2_stat,
        'Chi-square p-value': chi2_p
    })

interarrival_tests_df = pd.DataFrame(interarrival_tests)
print("\n", interarrival_tests_df.to_string(index=False))
print("\nNote: Higher p-values indicate better fit (fail to reject null hypothesis)")

## 4. Fit Distributions to Service Times

In [None]:
# Run all candidate fits on the service-time sample
service_fits = fit_distributions(service_times, "Service Times")

# Report each family's fitted parameters under a banner
print("=" * 80, "SERVICE TIMES - FITTED PARAMETERS", "=" * 80, sep="\n")

for fit_info in service_fits.values():
    print(f"\n{fit_info['name']} Distribution:")
    labelled_params = zip(fit_info['param_names'], fit_info['params'])
    for param_name, param_value in labelled_params:
        print(f"  {param_name}: {param_value:.6f}")

In [None]:
# Rank the fitted models for service times by information criteria
print("\n" + "=" * 80, "SERVICE TIMES - MODEL COMPARISON (AIC/BIC)", "=" * 80, sep="\n")

service_comparison = []
for fit_info in service_fits.values():
    aic, bic, log_lik = calculate_aic_bic(
        service_times, fit_info['dist'], fit_info['params']
    )
    row = {
        'Distribution': fit_info['name'],
        'Log-Likelihood': log_lik,
        'AIC': aic,
        'BIC': bic,
    }
    service_comparison.append(row)

# Sort ascending by AIC so the first row is the preferred model
service_comparison_df = pd.DataFrame(service_comparison).sort_values('AIC')
print("\n", service_comparison_df.to_string(index=False))
print("\nBest model (lowest AIC):", service_comparison_df.iloc[0]['Distribution'])

In [None]:
# Perform goodness-of-fit tests for service times
print("\n" + "=" * 80)
print("SERVICE TIMES - GOODNESS-OF-FIT TESTS")
print("=" * 80)

# NOTE: each distribution is tested against the same sample its parameters
# were fitted to, so treat the reported p-values as approximate (they tend
# to be optimistic).
service_tests = []
for dist_name, fit_info in service_fits.items():
    # Kolmogorov-Smirnov test
    ks_stat, ks_p = perform_ks_test(service_times, fit_info['dist'])

    # Chi-square test. The third return value is the degrees of freedom; it
    # must not be bound to `df`, which holds the loaded DataFrame.
    chi2_stat, chi2_p, chi2_dof = perform_chi_square_test(service_times, fit_info['dist'])

    service_tests.append({
        'Distribution': fit_info['name'],
        'KS Statistic': ks_stat,
        'KS p-value': ks_p,
        'Chi-square': chi2_stat,
        'Chi-square p-value': chi2_p
    })

service_tests_df = pd.DataFrame(service_tests)
print("\n", service_tests_df.to_string(index=False))
print("\nNote: Higher p-values indicate better fit (fail to reject null hypothesis)")

## 5. Visual Comparison - PDF Overlays

In [None]:
def plot_pdf_comparison(data, fits, title, bins=50):
    """
    Draw a density-normalised histogram of ``data`` and overlay the PDF of
    each fitted distribution in ``fits``.

    ``fits`` maps keys to dicts holding a frozen distribution under 'dist'
    and a legend label under 'name'. Four colour/linestyle pairs are
    defined, so at most four fits are drawn. Returns the figure.
    """
    fig, ax = plt.subplots(figsize=(14, 7))

    # Observed sample as a normalised histogram
    ax.hist(
        data,
        bins=bins,
        density=True,
        alpha=0.6,
        color='skyblue',
        edgecolor='black',
        label='Observed Data',
    )

    # Evaluation grid spanning the observed range
    grid = np.linspace(data.min(), data.max(), 1000)

    # One (colour, linestyle) pair per fitted curve
    curve_styles = zip(
        ['red', 'green', 'orange', 'purple'],
        ['-', '--', '-.', ':'],
    )
    for fit_info, (color, linestyle) in zip(fits.values(), curve_styles):
        density = fit_info['dist'].pdf(grid)
        ax.plot(
            grid,
            density,
            color=color,
            linestyle=linestyle,
            linewidth=2.5,
            label=fit_info['name'],
        )

    bold = {'fontweight': 'bold'}
    ax.set_xlabel('Time', fontsize=12, **bold)
    ax.set_ylabel('Probability Density', fontsize=12, **bold)
    ax.set_title(title, fontsize=14, **bold)
    ax.legend(loc='best', fontsize=11)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

In [None]:
# Render the histogram with fitted-PDF overlays for interarrival times
fig_interarrival = plot_pdf_comparison(
    data=interarrival_times,
    fits=interarrival_fits,
    title='Interarrival Times - Distribution Fitting Comparison',
)
plt.show()

In [None]:
# Render the histogram with fitted-PDF overlays for service times
fig_service = plot_pdf_comparison(
    data=service_times,
    fits=service_fits,
    title='Service Times - Distribution Fitting Comparison',
)
plt.show()

## 6. Q-Q Plots for Distribution Validation

In [None]:
def plot_qq_plots(data, fits, title):
    """
    Create Q-Q plots for all fitted distributions.

    Parameters
    ----------
    data : array-like
        Observed sample.
    fits : dict
        Maps keys to dicts holding a frozen distribution under 'dist' and a
        display label under 'name'. The figure is a fixed 2x2 grid, so up to
        four fits can be shown.
    title : str
        Figure-level title.

    Returns
    -------
    The matplotlib figure.
    """
    fig, axes = plt.subplots(2, 2, figsize=(14, 12))
    axes = axes.ravel()

    # The sample quantiles are the same for every panel, so sort once.
    sorted_data = np.sort(data)
    n = len(sorted_data)

    # Plotting positions (i - 0.5) / n pair the i-th order statistic with the
    # probability level it actually estimates. An evenly spaced grid over
    # [0.01, 0.99] would pull the theoretical tails inward and bend the plot
    # even for a perfect fit.
    probabilities = (np.arange(1, n + 1) - 0.5) / n

    for idx, (dist_name, fit_info) in enumerate(fits.items()):
        ax = axes[idx]

        # Generate theoretical quantiles
        theoretical_quantiles = fit_info['dist'].ppf(probabilities)

        # Plot Q-Q
        ax.scatter(theoretical_quantiles, sorted_data, alpha=0.6, s=20)

        # Plot diagonal reference line
        min_val = min(theoretical_quantiles.min(), sorted_data.min())
        max_val = max(theoretical_quantiles.max(), sorted_data.max())
        ax.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect Fit')

        ax.set_xlabel('Theoretical Quantiles', fontsize=11, fontweight='bold')
        ax.set_ylabel('Sample Quantiles', fontsize=11, fontweight='bold')
        ax.set_title(f'{fit_info["name"]} Distribution', fontsize=12, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    fig.suptitle(title, fontsize=14, fontweight='bold', y=1.00)
    plt.tight_layout()
    return fig

In [None]:
# Draw the Q-Q panel grid for interarrival times
fig_qq_interarrival = plot_qq_plots(
    data=interarrival_times,
    fits=interarrival_fits,
    title='Interarrival Times - Q-Q Plots',
)
plt.show()

In [None]:
# Draw the Q-Q panel grid for service times
fig_qq_service = plot_qq_plots(
    data=service_times,
    fits=service_fits,
    title='Service Times - Q-Q Plots',
)
plt.show()

## 7. CDF Comparison Plots

In [None]:
def plot_cdf_comparison(data, fits, title):
    """
    Draw the empirical CDF of ``data`` together with the CDF of each fitted
    distribution in ``fits``.

    ``fits`` maps keys to dicts holding a frozen distribution under 'dist'
    and a legend label under 'name'. Four colour/linestyle pairs are
    defined, so at most four fits are drawn. Returns the figure.
    """
    fig, ax = plt.subplots(figsize=(14, 7))

    # Empirical CDF: i-th smallest observation sits at cumulative level i / n
    ordered = np.sort(data)
    count = len(ordered)
    levels = np.arange(1, count + 1) / count
    ax.plot(ordered, levels, 'k-', linewidth=2, label='Empirical CDF', alpha=0.7)

    # Evaluation grid spanning the observed range
    grid = np.linspace(data.min(), data.max(), 1000)

    # One (colour, linestyle) pair per theoretical curve
    curve_styles = zip(
        ['red', 'green', 'orange', 'purple'],
        ['-', '--', '-.', ':'],
    )
    for fit_info, (color, linestyle) in zip(fits.values(), curve_styles):
        cumulative = fit_info['dist'].cdf(grid)
        ax.plot(
            grid,
            cumulative,
            color=color,
            linestyle=linestyle,
            linewidth=2.5,
            label=fit_info['name'],
        )

    bold = {'fontweight': 'bold'}
    ax.set_xlabel('Time', fontsize=12, **bold)
    ax.set_ylabel('Cumulative Probability', fontsize=12, **bold)
    ax.set_title(title, fontsize=14, **bold)
    ax.legend(loc='best', fontsize=11)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig

In [None]:
# Compare empirical and fitted CDFs for interarrival times
fig_cdf_interarrival = plot_cdf_comparison(
    data=interarrival_times,
    fits=interarrival_fits,
    title='Interarrival Times - CDF Comparison',
)
plt.show()

In [None]:
# Compare empirical and fitted CDFs for service times
fig_cdf_service = plot_cdf_comparison(
    data=service_times,
    fits=service_fits,
    title='Service Times - CDF Comparison',
)
plt.show()

## 8. Summary and Recommendations

In [None]:
# Headline findings: best model per dataset by AIC and by KS p-value
print("=" * 80, "DISTRIBUTION FITTING SUMMARY", "=" * 80, sep="\n")

print("\n" + "*" * 40, "INTERARRIVAL TIMES", "*" * 40, sep="\n")

# The comparison table was sorted by ascending AIC, so its first row is the winner
best_interarrival_aic = interarrival_comparison_df.iloc[0]
print(
    f"\nBest Distribution (AIC): {best_interarrival_aic['Distribution']}",
    f"  AIC: {best_interarrival_aic['AIC']:.4f}",
    f"  BIC: {best_interarrival_aic['BIC']:.4f}",
    sep="\n",
)

# Row with the largest KS p-value
best_interarrival_ks = interarrival_tests_df.loc[interarrival_tests_df['KS p-value'].idxmax()]
print(
    f"\nBest Distribution (KS Test): {best_interarrival_ks['Distribution']}",
    f"  KS p-value: {best_interarrival_ks['KS p-value']:.4f}",
    sep="\n",
)

print("\n" + "*" * 40, "SERVICE TIMES", "*" * 40, sep="\n")

best_service_aic = service_comparison_df.iloc[0]
print(
    f"\nBest Distribution (AIC): {best_service_aic['Distribution']}",
    f"  AIC: {best_service_aic['AIC']:.4f}",
    f"  BIC: {best_service_aic['BIC']:.4f}",
    sep="\n",
)

best_service_ks = service_tests_df.loc[service_tests_df['KS p-value'].idxmax()]
print(
    f"\nBest Distribution (KS Test): {best_service_ks['Distribution']}",
    f"  KS p-value: {best_service_ks['KS p-value']:.4f}",
    sep="\n",
)

print("\n" + "=" * 80, "RECOMMENDATIONS", "=" * 80, sep="\n")
print("""\n1. Choose the distribution with the lowest AIC/BIC values
2. Consider distributions with high p-values in KS and Chi-square tests (> 0.05)
3. Examine Q-Q plots for visual assessment of fit quality
4. For queue theory, exponential distribution is theoretically preferred if it fits well
5. If multiple distributions fit well, choose the simpler one (fewer parameters)
""")

## 9. Export Results

In [None]:
# Save comparison results to CSV
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

interarrival_comparison_df.to_csv(results_dir / 'interarrival_distribution_comparison.csv', index=False)
service_comparison_df.to_csv(results_dir / 'service_distribution_comparison.csv', index=False)

interarrival_tests_df.to_csv(results_dir / 'interarrival_goodness_of_fit.csv', index=False)
service_tests_df.to_csv(results_dir / 'service_goodness_of_fit.csv', index=False)

print("Results saved successfully!")
print("  - interarrival_distribution_comparison.csv")
print("  - service_distribution_comparison.csv")
print("  - interarrival_goodness_of_fit.csv")
print("  - service_goodness_of_fit.csv")

In [None]:
# Save figures
# savefig does not create missing parent directories, so make sure the
# output folder exists before writing.
figures_dir = Path('../figures')
figures_dir.mkdir(parents=True, exist_ok=True)

fig_interarrival.savefig(figures_dir / 'interarrival_pdf_comparison.png', dpi=300, bbox_inches='tight')
fig_service.savefig(figures_dir / 'service_pdf_comparison.png', dpi=300, bbox_inches='tight')
fig_qq_interarrival.savefig(figures_dir / 'interarrival_qq_plots.png', dpi=300, bbox_inches='tight')
fig_qq_service.savefig(figures_dir / 'service_qq_plots.png', dpi=300, bbox_inches='tight')
fig_cdf_interarrival.savefig(figures_dir / 'interarrival_cdf_comparison.png', dpi=300, bbox_inches='tight')
fig_cdf_service.savefig(figures_dir / 'service_cdf_comparison.png', dpi=300, bbox_inches='tight')

print("Figures saved successfully!")

## Conclusion

This notebook has performed comprehensive distribution fitting analysis including:

1. **Four Distribution Types**: Exponential, Gamma, Lognormal, and Weibull
2. **Model Selection**: AIC and BIC criteria for comparing models
3. **Statistical Tests**: Kolmogorov-Smirnov and Chi-square goodness-of-fit tests
4. **Visual Validation**: PDF overlays, Q-Q plots, and CDF comparisons

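Taken together, these results inform the choice of distribution for modeling your queue system's interarrival and service times. Because each goodness-of-fit test is run against parameters estimated from the same data, treat the p-values as approximate and weigh them alongside the AIC/BIC rankings and the diagnostic plots.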