# Historical Forecasting
I want to validate a Monte Carlo simulation. I have a year's worth of data from a team, counting the stories they completed in each sprint. Knowing how many stories they had completed by the end of the year, I want to estimate, after each sprint, the odds that they will complete that number of stories by the end of the year. In other words, we're back-testing the Monte Carlo simulation.

In [None]:
import urllib.parse
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Stories the team completed in each sprint, in chronological order
# (one entry per sprint).
stories_completed_each_sprint = [
    2, 2, 7, 1, 3, 6, 7,
    6, 5, 2, 1, 1, 4, 5,
    5, 6, 5, 3, 5, 5, 5,
    8, 5, 4, 3, 7, 14, 4,
]

# Number of Monte Carlo iterations to run for each forecast.
num_simulations = 1_000

In [None]:
# Back-test the Monte Carlo forecast. After every sprint, resample the
# velocities observed so far to simulate how many more sprints are needed to
# finish the rest of the (known) year-end backlog, then plot the distribution
# of forecast completion sprints, annotated with cumulative percentages.
total_sprints = len(stories_completed_each_sprint)
total_backlog = sum(stories_completed_each_sprint)
print(f"There are {total_sprints} total sprints.")
print(f"The backlog has {total_backlog} total stories.")

for sprint in range(total_sprints):
    # `sprint` is the 0-based loop index; the sprint just completed is
    # number `sprint + 1`, and only its history (and earlier) may be sampled.
    stories_completed = stories_completed_each_sprint[:sprint+1]
    completed_so_far = sum(stories_completed)
    print(f"We just completed sprint {sprint+1} and have finished {completed_so_far} total stories.")
    backlog_size = total_backlog - completed_so_far

    # If no stories have been completed yet, every sampled velocity is 0 and
    # the simulation loop below would never terminate. Skip this forecast.
    if backlog_size > 0 and max(stories_completed) <= 0:
        print(f"Skipping the forecast after sprint {sprint+1}: no stories have been completed yet.")
        continue

    ## Set up our Monte Carlo Simulation.
    # One slot per simulation run, holding the number of additional sprints
    # that run needed to empty the backlog.
    sprints_to_completion = np.zeros(num_simulations)

    ## Run our simulations
    for j in range(num_simulations):
        # Initialize the backlog for this iteration
        remaining_backlog = backlog_size
        sprint_count = 0

        while remaining_backlog > 0:
            # Randomly select the number of stories completed in a sprint
            # from the history observed so far.
            sim_stories_completed = np.random.choice(stories_completed)

            # Remove those stories from the backlog.
            remaining_backlog -= sim_stories_completed
            sprint_count += 1

        sprints_to_completion[j] = sprint_count

    ## Analysis
    # Convert "additional sprints needed" into an absolute, 1-based sprint
    # number. We have just completed sprint `sprint + 1`, so the work finishes
    # in sprint (sprint + 1) + sprints_to_completion. Adding only `sprint`
    # would report every forecast one sprint too early.
    adjusted_sprints = sprints_to_completion + (sprint + 1)

    # Define bins: start from the minimum of adjusted_sprints, go up to the maximum plus 2
    # The '+2' ensures the last bin includes the maximum value
    non_empty_bins = np.arange(adjusted_sprints.min(), adjusted_sprints.max() + 2)

    # Calculate histogram data
    hist, bins = np.histogram(adjusted_sprints, bins=non_empty_bins)

    # Calculate cumulative percentages
    cumulative_counts = np.cumsum(hist)
    total_counts = cumulative_counts[-1]
    cumulative_percentages = cumulative_counts / total_counts * 100

    # Plotting the histogram. align='left' centers each bar on its left bin
    # edge, i.e. on the integer sprint number it represents.
    plt.figure(figsize=(10, 6))
    plt.hist(adjusted_sprints, bins=bins, alpha=0.7, color='blue', edgecolor='black', align='left')

    # Annotate each bar with the cumulative percentage of simulations that
    # finished in that sprint or earlier.
    for sprint_number, count, percentage in zip(bins[:-1], hist, cumulative_percentages):
        plt.text(sprint_number, count, f'{percentage:.1f}%', ha='center', va='bottom')

    plt.title('Distribution of Adjusted Sprints Required to Complete the Backlog with Cumulative Percentages')
    # The actual finishing sprint is the length of the history.
    plt.xlabel(f'Sprint Number (we actually finish in sprint {total_sprints})')
    plt.ylabel('Frequency')
    plt.xticks(bins[:-1])  # Set x-axis ticks to align with the adjusted sprint numbers
    plt.show()

