In [None]:
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.linear_model import LinearRegression
import seaborn as sns

<h3>Prepare Raw Data</h3>

Note: data points through OpenAI Five were provided by OpenAI and have been redacted; data points on language models since then were provided by us

In [None]:
# Each row is [release date as an 'm/d/Y' string, training compute in petaFLOPS-days, model name].
# Later cells split this list purely by position (the last five rows are treated as the
# language models added after the OpenAI data), so any new row must respect that layout.
# NOTE(review): the language-model rows are not in chronological order (RoBERTa precedes GPT-2).
raw_data = [
            # 11 removed items
            ['10/11/2018',6.16,'BERT'],
            ['7/1/2019',49.3,'RoBERTa'],
            ['2/14/2019',31.25,'GPT-2'], # GPT-2 is estimated from GPT-3 XL: 27.5 PF-days for 1.32B params, scaled to 31.25 PF-days for 1.5B params
            ['10/23/2019',382,'T5'],
            ['5/28/2020', 3.64E3, 'GPT-3'] # The GPT-3 paper says 3.64E3 PF-days, but the Nvidia paper says almost 5k
]

Convert the data so that we have datetime dates and base-2 logs of the compute amount. Also distinguish the OpenAI data from the language model data we have added since the end of that data.

In [None]:
# Turn every raw row into [datetime of release, log2(petaFLOPS-days)]; the model name is dropped
data_converted = [
    [dt.datetime.strptime(date_string, '%m/%d/%Y'), np.log2(pf_days)]
    for date_string, pf_days in ((row[0], row[1]) for row in raw_data)
]

# The final five rows are the language models we added; everything before them is the OpenAI data
notable_llms = data_converted[-5:]
openai_data = data_converted[:-5]

Convert all the dates to a raw number representing the number of seconds since 6/23/1912 (an arbitrary date that is 100 years before AlexNet), then calculate separate linear regressions for the period pre- and post-AlexNet. Note that we only care about evaluating the future of the trend originally identified by OpenAI, so we do not include the language models released since their research in our regression lines. 

In [None]:
# Reference instant for turning dates into plain numbers (seconds elapsed since 6/23/1912)
basetime = dt.datetime.strptime('6/23/1912','%m/%d/%Y')

# Dates and log2(compute) values for the OpenAI data only
x = [point[0] for point in openai_data]
y = [point[1] for point in openai_data]


def _seconds_since_basetime(dates):
    """Return one single-feature row per date: seconds elapsed since `basetime`."""
    return [[(moment - basetime).total_seconds()] for moment in dates]


# First 11 points: the pre-AlexNet era
X_early = _seconds_since_basetime(x[:11])
reg_early = LinearRegression()
reg_early.fit(X_early, y[:11])
y_early_fit = reg_early.predict(X_early)

# Remaining points: the post-AlexNet era
X_late = _seconds_since_basetime(x[11:])
reg_late = LinearRegression()
reg_late.fit(X_late, y[11:])
y_late_fit = reg_late.predict(X_late)

Plot the data to make sure it looks right

In [None]:
# Raw points as blue dots, then one fitted segment per era as a blue line
plt.plot(x, y, 'bo')
for segment_dates, segment_fit in ((x[:11], y_early_fit), (x[11:], y_late_fit)):
    plt.plot(segment_dates, segment_fit, 'b-')

plt.title('History of Compute-Intensive Machine Learning')
plt.xlabel('Date')
plt.ylabel('Computing Required')
plt.show()

<h3>Create Predictor Functions</h3>

We want to be able to predict the date at which a certain compute level will be reached, or the compute level that will be reached by a certain date, or the date at which a certain cost will be required, and so on. 

In [None]:
def predict_compute(date_string):
    """Evaluate the post-AlexNet regression at a given date.

    Parameters
    ----------
    date_string : str
        Date formatted as 'm/d/Y' (same format as the raw data).

    Returns
    -------
    The output of `reg_late.predict` for that single date, i.e. a one-element
    array holding the BASE-2 LOG of the predicted compute, not the compute itself.
    """
    # Same reference instant as the notebook-level basetime
    epoch = dt.datetime.strptime('6/23/1912','%m/%d/%Y')

    # The regression was fitted on seconds elapsed since that instant
    elapsed = dt.datetime.strptime(date_string,'%m/%d/%Y') - epoch

    return reg_late.predict([[elapsed.total_seconds()]])

In [None]:
def predict_date(compute):
    """Invert the post-AlexNet regression: when does the trend reach `compute`?

    Parameters
    ----------
    compute : float
        Compute level in the same (non-log) units as the raw data.

    Returns
    -------
    datetime.datetime
        The date at which the `reg_late` line reaches log2(compute).
    """
    target_log2 = np.log2(compute)

    # Solve log2(compute) = intercept + coef * seconds for seconds; coef_ is an array, so the result is too
    seconds_since_basetime = (target_log2 - reg_late.intercept_) / reg_late.coef_

    return basetime + dt.timedelta(seconds=seconds_since_basetime[0])

For calculations involving the cost of these compute levels, we want to compare predictions under three scenarios: compute/dollar remaining fixed; compute/dollar doubling every 2 years (optimistic scenario); and compute/dollar doubling every 4 years (likely scenario). To do this, we need "offset" functions that can reflect changing prices for compute. The following function provides linear regressions for cost curves. It takes a date as an input, where the provided date represents the date at which compute/dollar will have doubled relative to January 1, 2021 (a somewhat arbitrary date representing the point at which we let these curves start diverging). It returns a regression that can be used to offset cost predictions based on changing price information.

In [None]:
def make_cost_curve(new_date):
    """Build a log2 price line for one petaFLOPS-day of compute.

    The line passes through the 1/1/2021 price and through half that price at
    `new_date`, i.e. `new_date` is when compute/dollar has doubled.

    Parameters
    ----------
    new_date : str
        Date formatted as 'm/d/Y'.

    Returns
    -------
    LinearRegression
        Fitted on (seconds since `basetime`, log2(cost per petaFLOPS-day)).
    """
    # Starting price: a Google Cloud TPU v3 at $8.00/hr with an advertised 420 teraFLOPs (0.42 petaFLOPs)
    tpu3_pflops = .42
    tpu3_cost = 8
    cost_per_pflops_day = (1E15 * 3600 * 24 * tpu3_cost) / (tpu3_pflops * 1E15 * 3600)

    # x-coordinates: the start of the curve and the doubling date, both in seconds since basetime
    anchor_seconds = [
        (dt.datetime.strptime(day, '%m/%d/%Y') - basetime).total_seconds()
        for day in ('1/1/2021', new_date)
    ]

    # y-coordinates: log2 of the starting price and log2 of half that price
    anchor_log2_costs = [np.log2(cost_per_pflops_day), np.log2(cost_per_pflops_day / 2)]

    return LinearRegression().fit([[seconds] for seconds in anchor_seconds], anchor_log2_costs)

# Price curves for compute/dollar doubling every 4 years and every 2 years
cost_curve_4 = make_cost_curve('1/1/2025')
cost_curve_2 = make_cost_curve('1/1/2023')

In [None]:
def predict_date_of_cost(cost, offset=None):
    """Predict the date at which the trendline model costs `cost` dollars.

    Parameters
    ----------
    cost : float
        Total dollar cost of interest.
    offset : fitted regression or None
        A price curve as returned by `make_cost_curve` (predicts log2 of the
        cost per petaFLOPS-day from seconds since `basetime`). With None, the
        1/1/2021 price is assumed to stay fixed.

    Returns
    -------
    datetime.datetime
        The date at which the post-AlexNet trend reaches the amount of compute
        that `cost` dollars can buy.
    """
    # Starting price: a Google Cloud TPU v3 at $8.00/hr with an advertised 0.42 petaFLOPs
    tpu3_pflops = .42
    tpu3_cost = 8
    cost_per_pflops_day = (1E15 * 3600 * 24 * tpu3_cost) / (tpu3_pflops * 1E15 * 3600)

    # First guess: the date at which this cost would be expended at a fixed price of compute
    date = predict_date(cost / cost_per_pflops_day)

    if offset is None:
        return date

    # Fixed-point iteration: re-price compute at the current date estimate, then
    # re-estimate the date from the compute that `cost` buys at that price. Stop once
    # the price per petaFLOPS-day moves by no more than $1.00 between iterations.
    # The magnitude of the change is compared because the price can also move UP
    # between iterations (e.g. when the first guess falls before 1/1/2021); a signed
    # comparison would end the loop after a single, unconverged step in that case.
    cost_diff = np.inf
    while abs(cost_diff) > 1:

        # 2 ** (curve prediction) is the cost per petaFLOPS-day expected as of `date`
        seconds = (date - basetime).total_seconds()
        new_cost = 2 ** offset.predict(np.array(seconds).reshape(1, -1))[0]

        # Remember how far the price moved, then adopt the new price
        cost_diff, cost_per_pflops_day = cost_per_pflops_day - new_cost, new_cost

        # New date estimate from the compute purchasable at the updated price
        date = predict_date(cost / cost_per_pflops_day)

    return date

In [None]:
def predict_cost(date_string, offset=None):
    """Predict the dollar cost of the trendline model on a given date.

    Parameters
    ----------
    date_string : str
        Date formatted as 'm/d/Y'.
    offset : fitted regression or None
        A price curve as returned by `make_cost_curve`. With None, the fixed
        1/1/2021 price per petaFLOPS-day is used.

    Returns
    -------
    Predicted compute on that date times the cost per petaFLOPS-day on that date.
    """
    if offset is None:
        # Fixed price: a Google Cloud TPU v3 at $8.00/hr with an advertised 0.42 petaFLOPs
        tpu3_pflops = .42
        tpu3_cost = 8
        unit_cost = (1E15 * 3600 * 24 * tpu3_cost) / (tpu3_pflops * 1E15 * 3600)
    else:
        # Price read off the curve at the requested date (the curve works in log2 dollars)
        when = dt.datetime.strptime(date_string, '%m/%d/%Y')
        seconds = (when - basetime).total_seconds()
        unit_cost = 2 ** offset.predict(np.array(seconds).reshape(1, -1))[0]

    # predict_compute returns log2(compute) in a one-element array
    compute_level = 2 ** predict_compute(date_string)[0]

    return compute_level * unit_cost

We now use these functions to predict the date at which certain milestones of expenditures will be reached: \\$3.5 billion (cost of the National Ignition Facility), \\$13.5 billion (cost of the search for the Higgs Boson), \\$450 billion (2.2\% of U.S. GDP in 2019, representing the amount that was spent annually on the Apollo Project), and 
\\$21.43 trillion (representing U.S. GDP in 2019). For all predictions we compare the base assumption of no change in cost of compute to our "most likely" cost estimate of a doubling in compute/dollar every 4 years.

In [None]:
# (label, dollar cost) for each expenditure milestone
milestones = [
    ('National Ignition Facility', 3.5E9),
    ('Higgs Boson', 1.35E10),
    ('Apollo Program', 4.5E11),
    ('2019 U.S. GDP', 2.143E13),
]

for position, (label, dollars) in enumerate(milestones):
    print(label)

    # Fixed price of compute first, then compute/dollar doubling every 4 years
    print(predict_date_of_cost(dollars))
    date_with_price_decline = predict_date_of_cost(dollars, offset=cost_curve_4)

    # Every milestone except the last is followed by a blank separator
    if position < len(milestones) - 1:
        print(date_with_price_decline, '\n')
    else:
        print(date_with_price_decline)

<h3>Generate Figure 1</h3>

Will display the growth in compute demands before and after AlexNet

In [None]:
"""Note: although most of the calculations are done using log-2 (because we frequently
talk about doubling times in the text of the paper), we wanted this graph to be in 
log-10 (because orders of magnitude are visually easier to digest). So all the plots
contain np.log10(2 ** x) to convert any given value from log-2 to log-10."""

# Two side-by-side panels: the full OpenAI history (left) and the post-AlexNet era (right)
sns.set_style('white')
fig, (ax0, ax1) = plt.subplots(1, 2, constrained_layout=True, figsize=(10,4.25))
sns.despine()

# Colour palette; c1, c2 and c3 are notebook-level names that later figure cells and helpers also read
c1 = "#003DA6"
c2 = "#853A6D"
c3 = "#7AC4A5"

# Plot all of the data points from OpenAI on the left-hand figure
ax0.scatter(x, [np.log10(2 ** i) for i in y], c=c1)

# Plot the regression lines matching the pre- and post-AlexNet eras (first 11 points vs. the rest)
ax0.plot(x[0:11], np.log10(2 ** y_early_fit), c=c1)
ax0.plot(x[11::], np.log10(2 ** y_late_fit), c=c1)

# Add labels; the y axis shows the base-10 exponent of the compute value
ax0.set_xlabel('Year', fontsize=14)
ax0.set_ylabel('petaFLOPS-days ($10^y$)', fontsize=14)
ax0.set_title('Historical Trend', fontsize=16)


# Generate x and y coordinates for the large language models (last 5 items in our data_converted list)
x_llms = [i[0] for i in notable_llms]
y_llms = [np.log10(2 ** i[1]) for i in notable_llms]

# Extend the regression line through the time period spanned by these language models, using the raw date strings from our raw_data list
# (predict_compute returns one-element arrays, so each entry here is a one-element array)
pred_y_llms = [np.log10(2 ** predict_compute(val[0])) for val in raw_data[-5:]]

# Plot all the post-AlexNet OpenAI data points as blue dots and a corresponding regression line in blue
ax1.scatter(x[11:], [np.log10(2 ** i) for i in y[11:]], c=c1, label='Historical Data\n[OpenAI Research]')
ax1.plot(x[11:], np.log10(2 ** y_late_fit), c=c1)

# Plot the more recent language models as green squares and an extension of the OpenAI regression line as a dashed green line
# NOTE(review): the dashed line is drawn between the FIRST and LAST rows of the language-model list,
# so it spans the full period only while those two rows are the earliest and latest dates
ax1.scatter(x_llms, y_llms, marker='s', c=c3, label='Recent Large\nLanguage Models')
ax1.plot([x_llms[0], x_llms[-1]], [pred_y_llms[0], pred_y_llms[-1]], c=c3, linestyle='--')

# Adjust labelling (loc=4 places the legend in the lower right)
ax1.set_xlabel('Year', fontsize=14)
ax1.legend(loc=4, fontsize=12)
ax1.set_title('Trend Since AlexNet', fontsize=16)

# Uncomment to write the figure to disk
#plt.savefig('compute_trends.jpg', dpi=300, bbox_inches='tight')

plt.show()

<h3>Generate Figure 2</h3>

This figure will cover the time from January 1, 2022 to June 1, 2027. We do not want to show when costs will intersect with static, 2019-level GDP; rather, we want to figure out when they will intersect with a growing GDP. We assume growth of 3% for U.S. GDP, with error shading of 2% on the low end and 5% on the high end.

In [None]:
# Price constants at notebook level: a Google Cloud TPU v3 at $8.00/hr with an advertised 0.42 petaFLOPs
tpu3_pflops = .42
tpu3_cost = 8
cost_per_pflops_day = (1E15 * 3600 * 24 * tpu3_cost) / (tpu3_pflops * 1E15 * 3600)

# Time window of the figure, expressed in seconds since basetime so the regressions can consume it
start, end = (
    (dt.datetime.strptime(day, '%m/%d/%Y') - basetime).total_seconds()
    for day in ('1/1/2022', '6/1/2027')
)

# 100 evenly spaced instants as a single-feature column
t = np.linspace(start, end, num=100).reshape(100, 1)

# The same instants as datetime objects, used for the x axis
dates_for_labels = [basetime + dt.timedelta(seconds=seconds) for seconds in t[:, 0]]

# Compute level predicted by the post-AlexNet regression at each instant (regression output is log2)
compute = 2 ** reg_late.predict(t)

In [None]:
# Create a function to calculate the GDP at any given point in time
def calc_gdp(date_in_seconds_since_basetime, growth_rate):
    """Project U.S. GDP at a point in time under continuous exponential growth.

    Parameters
    ----------
    date_in_seconds_since_basetime : float
        Instant of interest, in seconds since `basetime` (a value from a `t` array).
    growth_rate : float
        Annual growth rate r in GDP_t = GDP_0 * e^(r t).

    Returns
    -------
    float
        The NATURAL LOG of projected GDP; callers exponentiate it themselves.

    Notes
    -----
    GDP_0 is the 2019 figure and t is measured from 1/1/2020 in 365-day years.
    """
    gdp_at_anchor = 2.143E13  # U.S. GDP in 2019
    growth_anchor = dt.datetime.strptime('1/1/2020', '%m/%d/%Y')
    seconds_per_year = 3600 * 24 * 365

    # Back to a datetime, then to elapsed years since the anchor date
    moment = basetime + dt.timedelta(seconds=date_in_seconds_since_basetime)
    elapsed_years = (moment - growth_anchor).total_seconds() / seconds_per_year

    # ln(GDP_t) = ln(GDP_0) + r * t
    return np.log(gdp_at_anchor) + growth_rate * elapsed_years

Next we need a function to figure out when our line will actually cross the GDP level depending on different pricing assumptions. We do this in a naive way: create an array of 5,000 points in time across our range of interest, then for each point, calculate the expected GDP and the extrapolated total cost for training an AI model. Then simply return the first point at which the latter value is greater than the former. 

In [None]:
def find_date_of_crossing(offset=False):
    """Find when the predicted model cost first exceeds projected U.S. GDP.

    The window from `start` to `end` is sampled at 5,000 instants; at each one
    the extrapolated training cost is compared against GDP growing at 3%.

    Parameters
    ----------
    offset : fitted regression, False or None
        A price curve as returned by `make_cost_curve`. False (the default) or
        None means the fixed notebook-level `cost_per_pflops_day` is used. None
        is accepted so this function agrees with the `offset=None` convention of
        `predict_cost` and `predict_date_of_cost`.

    Returns
    -------
    datetime.datetime
        The first sampled instant at which cost > GDP.

    Raises
    ------
    IndexError
        If the cost never exceeds GDP inside the sampled window.
    """
    n_points = 5000

    # Far more granular than is needed to later plot the function
    t = np.linspace(start, end, num=n_points).reshape(n_points, 1)

    # Expected compute utilization at each instant (regression output is log2)
    compute = 2 ** reg_late.predict(t)

    # Cost of one petaFLOPS-day at each instant
    if offset is None or offset is False:
        cost_array = np.full(n_points, cost_per_pflops_day)
    else:
        cost_array = 2 ** offset.predict(t)

    # Projected GDP at each instant (calc_gdp returns the natural log)
    gdp_projections = np.array([np.e ** calc_gdp(seconds, 0.03) for seconds in t[:, 0]])

    # Indices of the instants at which the model cost exceeds projected GDP
    crossing_indices = np.where(compute * cost_array > gdp_projections)[0]
    if crossing_indices.size == 0:
        raise IndexError('predicted cost does not exceed projected GDP within the sampled window')

    return basetime + dt.timedelta(seconds=t[crossing_indices[0]][0])

# Check the expected point at which we cross U.S. GDP under each of our three pricing scenarios
print(find_date_of_crossing())
print(find_date_of_crossing(cost_curve_4))
print(find_date_of_crossing(cost_curve_2))

Our figure itself involves lots of annotations and repetitive lines, so we will use a few custom helper functions. For our linear-scale graph, we want to display the y-axis in "trillions of dollars," so we will divide all of our compute costs and GDP projections by 10 ^ 12. 

In [None]:
def plot_lines(ax, linear=False):
    """Draw the predicted-cost and predicted-GDP curves, with shading, on `ax`.

    Reads the notebook-level time grid `t`, its datetime counterpart
    `dates_for_labels`, the regression `reg_late`, the price curves
    `cost_curve_4` / `cost_curve_2`, the fixed `cost_per_pflops_day`, and the
    colours `c1` / `c2`.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    linear : bool
        When True, all dollar values are divided by 1E12 so the axis reads in
        trillions of dollars.
    """
    # Predicted compute at each instant of `t`. This is derived here rather than read from
    # the notebook-level name `compute`, because a later cell rebinds that name to a
    # different array, which would break this helper if the figure cell were re-run.
    predicted_compute = 2 ** reg_late.predict(t)

    # Price of one petaFLOPS-day under the three pricing scenarios
    cost_array_fixed = np.full(len(t), cost_per_pflops_day)
    cost_array_4 = 2 ** cost_curve_4.predict(t)
    cost_array_2 = 2 ** cost_curve_2.predict(t)

    # Total cost of the predicted compute: fixed price is the upper edge, 2-year doubling the lower edge
    compute_upper = predicted_compute * cost_array_fixed
    compute_mid = predicted_compute * cost_array_4
    compute_lower = predicted_compute * cost_array_2

    # Projected GDP under 5%, 3% and 2% growth (calc_gdp returns the natural log)
    gdp_upper = np.array([np.e ** calc_gdp(i[0], 0.05) for i in t])
    gdp_mid = np.array([np.e ** calc_gdp(i[0], 0.03) for i in t])
    gdp_lower = np.array([np.e ** calc_gdp(i[0], 0.02) for i in t])

    # For the graph with a linear-scale axis, divide values by one trillion
    if linear:
        compute_upper, compute_mid, compute_lower = compute_upper / 1E12, compute_mid / 1E12, compute_lower / 1E12
        gdp_upper, gdp_mid, gdp_lower = gdp_upper / 1E12, gdp_mid / 1E12, gdp_lower / 1E12

    # Central compute-cost line (solid) with shading between the pricing extremes
    ax.plot(dates_for_labels, compute_mid, c=c1, linestyle='-', label='Predicted Cost of\nLargest AI Model')
    ax.fill_between(dates_for_labels, compute_upper, compute_lower, color=c1, alpha=0.1)

    # Central GDP line (dotted) with shading between the growth extremes
    ax.plot(dates_for_labels, gdp_mid, linestyle=':', color=c2, label='Predicted U.S. GDP')
    ax.fill_between(dates_for_labels, gdp_upper, gdp_lower, color=c2, alpha=0.1)

    # Axis label and legend (loc=2 places the legend in the upper left)
    ax.set_xlabel('Year', fontsize=14)
    ax.legend(loc=2, edgecolor='white')

In [None]:
def annotate_cost(ax, text, cost, xycoords, linear=False):
    """Label the point at which the predicted model cost reaches `cost` dollars.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to annotate.
    text : str
        Annotation label.
    cost : float
        Dollar amount; the arrow points at the date this cost is reached under
        the 4-year price curve (`cost_curve_4`).
    xycoords : tuple
        Position of the label text, as a fraction of the axes.
    linear : bool
        When True, the y coordinate is expressed in trillions of dollars.
    """
    # Choose the appropriate y coordinate depending on whether our graph is using a linear scale or not
    cost_to_print = cost if linear == False else cost / 1E12

    # Annotate with an arrow pointing to (date at which the cost is reached, the cost itself).
    # The label is passed positionally, like the other annotate calls in this notebook,
    # because the keyword for this argument differs between matplotlib versions.
    ax.annotate(
      text,
      xy=(predict_date_of_cost(cost, offset=cost_curve_4), cost_to_print),
      xytext=xycoords,
      textcoords='axes fraction',
      ha='center',
      arrowprops=dict(arrowstyle='wedge', color='black', lw=1)
    )

Now we finally make the figure!

In [None]:
sns.set_style('white')
fig, (ax0, ax1) = plt.subplots(2, 1, constrained_layout=True, figsize=(8, 9))
sns.despine()

# Date used for the GDP annotation (taken from the find_date_of_crossing output above) and the cost predicted for it
gdp_crossing_string = '11/15/2026'
gdp_crossing_date = dt.datetime.strptime(gdp_crossing_string, '%m/%d/%Y')
gdp_crossing_cost = predict_cost(gdp_crossing_string, offset=cost_curve_4)

# ---- Top panel: log-scale dollars ----
plot_lines(ax0)
ax0.set_ylabel('Dollars (Log Scale)', fontsize=14)
ax0.set_yscale('log')
ax0.set_ylim(1E8, 2E15)

# Milestone annotations: (label, dollar cost, label position in axes fractions)
for label, dollars, label_position in (
    ('Cost of the NIF', 3.5E9, (0.15, 0.4)),
    ('Cost of the Higgs\nBoson Search', 1.35E10, (0.6, 0.2)),
    ('2.2% of GDP (Annual Cost\nof Apollo Project)', 5.2E11, (0.4, 0.6)),
):
    annotate_cost(ax0, label, dollars, label_position)

ax0.annotate(
    'Total U.S. GDP',
    xy=(gdp_crossing_date, gdp_crossing_cost),
    xytext=(0.65, 0.85),
    textcoords='axes fraction',
    ha='center',
    arrowprops=dict(arrowstyle='wedge', color='black', lw=1)
)

# ---- Bottom panel: linear scale, trillions of dollars ----
plot_lines(ax1, linear=True)
ax1.set_ylabel('Trillions of Dollars (Linear Scale)', fontsize=14)
ax1.set_ylim(0, 45)

for label, dollars, label_position in (
    ('Cost of the NIF', 3.5E9, (0.15, 0.1)),
    ('Cost of the Higgs\nBoson Search', 1.35E10, (0.3, 0.2)),
    ('2.2% of GDP (Annual Cost\nof Apollo Project)', 5.2E11, (0.5, 0.35)),
):
    annotate_cost(ax1, label, dollars, label_position, linear=True)

ax1.annotate(
    'Total U.S. GDP',
    xy=(gdp_crossing_date, gdp_crossing_cost / 1E12),
    xytext=(0.65, 0.7),
    textcoords='axes fraction',
    ha='center',
    arrowprops=dict(arrowstyle='wedge', color='black', lw=1)
)

# Uncomment to write the figure to disk
#plt.savefig('gdp_projections.jpg', dpi=300, bbox_inches='tight')

plt.show()

<h3>Analyze Use of Available GPUs</h3>

In this section we want to ask at what point a model predicted by this trendline would require the use of all accelerators in all cloud datacenters to train continuously for the accelerators' full productive lives. 

It is hard to find hard numbers for production volumes of accelerators capable of training large models. 

Apparently Intel says 7% of the 12 million server units worldwide are for deep learning. (https://www.businesswire.com/news/home/20210819005361/en/Global-Data-Center-Accelerator-Market-Forecast-to-2026-Artificial-Intelligence-to-Drive-the-Growth-of-Cloud-Data-Center-Market---ResearchAndMarkets.com)


That seems low given that total quarterly GPU production was 123 million units, with Nvidia supplying 15.23%. (https://www.tomshardware.com/news/jpr-gpu-q2-vendor-share) They're pulling 25%-37% of revenue from servers (https://www.digitimes.com/news/a20200630PD213.html) and GPUs accounted for 85% of the accelerator market (https://www.mynewsdesk.com/brandessence/pressreleases/data-center-accelerator-market-size-2021-cagr-38-dot-7-percent-3112488) or 80.6% of global AI data center processor revenue (https://omdia.tech.informa.com/pr/2021-aug/nvidia-maintains-dominant-position-in-2020-market-for-ai-processors-for-cloud-and-data-center) (Xilinx next with FPGAs then Google TPUs)


All discrete GPUs from all vendors (at least Intel, AMD, and Nvidia) were 22M in Q1 2021. (https://www.tomshardware.com/news/mercury-research-gpu-report-q1-2021)

In [None]:
# Estimated accelerators produced per quarter, derived from the server fraction of all Nvidia GPUs
quarterly_production = (
    (123E6)     # GPUs produced per quarter, all vendors
    * 0.1523    # Nvidia's share of those units
    * 0.37      # upper estimate of the share of Nvidia revenue coming from servers
    / 0.806     # Nvidia GPUs' share of AI data center processor revenue
)

# Four quarters' worth: how many accelerators in total per year?
print(quarterly_production * 4)

In [None]:
# Productive lifetime of an accelerator, in years
useful_life = 3

# Accelerators available to train on at any given time: every unit produced within one lifetime
useful_gpu_count = (quarterly_production * 4) * useful_life

# Rough throughput of one accelerator in petaFLOPS (https://arxiv.org/abs/2104.04473)
compute_per_gpu = 0.163

So, we estimate that there are useful_gpu_count number of GPUs available to train on at any given time, and for simplicity, we assume they all have the same throughput of 0.163 petaFLOPs per second (which is probably much higher than the actual average value). How many computations can be performed over a three-year period with these resources?

In [None]:
# Work done by the whole fleet running non-stop for one useful lifetime:
# per-accelerator petaFLOPS x seconds in `useful_life` 365-day years x number of accelerators
compute_total = (compute_per_gpu * 3600 * 24 * 365 * useful_life) * useful_gpu_count

# Dividing by the number of seconds in a day expresses that total in petaFLOPS-days
compute_total = compute_total / (3600 * 24)

# Display the result
compute_total

In [None]:
# Date at which the post-AlexNet trendline reaches the fleet-wide compute_total calculated above
predict_date(compute_total)

So our calculations predict that we will have to use every GPU in all cloud datacenters by the very end of 2025 in order to keep up with the trendline. For safety, let's check which dates would be consistent with being off by an order of magnitude in either direction for our estimate of total accelerators.

In [None]:
# Dates implied by an accelerator estimate that is 10x too high (lower bound) or 10x too low (upper bound)
lower_bound_annual_production = predict_date(compute_total / 10)
upper_bound_annual_production = predict_date(compute_total * 10)

print(lower_bound_annual_production, upper_bound_annual_production, sep='\n')

<h3>Generate Figure 3</h3>

This figure uses some custom data from a collection of sources.

In [None]:
# GPT-3 paper: parameter counts of the model family and the compute reported for each
gpt3_params = np.array([125E6, 356E6, 760E6, 1320E6, 2650E6, 6660E6, 12850E6, 174600E6])
gpt3_compute = np.array([2.6, 7.42, 15.8, 27.5, 55.2, 139, 268, 3640])

# Nvidia's 1-trillion-parameter estimate: 84 days on 3072 GPUs that each deliver 0.163 PF/s
# (https://arxiv.org/abs/2104.04473)
nvidia_params = 1E12
nvidia_compute = 84 * 3072 * 0.163


def find_compute(n_params):
    """Compute predicted for a model with `n_params` parameters.

    Implements the params-to-compute relationship as given by OpenAI's Scaling
    Laws paper: (n_params / 1.3E9) ** (1 / 0.73). Accepts a scalar or a numpy
    array and returns a value of the same shape.
    """
    reference_params = 1.3E9
    exponent = 1 / 0.73
    return (n_params / reference_params) ** exponent


# End points of the line illustrating this scaling law.
# NOTE: this rebinds the notebook-level name `compute`, first assigned in the Figure 2 setup cell.
params = np.array([1E9, 1E15])
compute = find_compute(params)

In [None]:
sns.set_style('white')
fig, ax = plt.subplots(figsize=(10, 5))
sns.despine()

# Log-10 coordinates of the two hypothetical models placed on the minimum-compute line:
# "GPT-4" (17.5 trillion parameters) and a 100-trillion-parameter model
gpt4_x, gpt4_y = np.log10(1.75E13), np.log10(find_compute(1.75E13))
hundred_t_x, hundred_t_y = np.log10(1E14), np.log10(find_compute(1E14))

# Scaling-law line, in log base 10 on both axes
ax.plot(np.log10(params), np.log10(compute), c=c1, label='Predicted Minimum\nCompute', zorder=1)

# GPT-3 family as green squares
ax.scatter(np.log10(gpt3_params), np.log10(gpt3_compute), marker='s', c=c3, label='Actual GPT-3 Compute')

# Nvidia's estimate for 1 trillion parameters
ax.scatter(np.log10(nvidia_params), np.log10(nvidia_compute), marker='D', color="#853A6D", label='Projected Compute Levels')

# The two hypothetical models, drawn above the line
ax.scatter([gpt4_x, hundred_t_x], [gpt4_y, hundred_t_y], marker='D', color='#853A6D', zorder=2)

# Annotations: (label, arrow target, label position in axes fractions, arrow style).
# The small offsets on the arrow targets are purely aesthetic.
for label, arrow_target, label_position, arrow_style in (
    (
        'Hypothetical GPT-4\n(17.5 Trillion Parameters)',
        (gpt4_x + 0.05, gpt4_y - 0.3),
        (0.8, 0.45),
        dict(arrowstyle='->', color='black', lw=1),
    ),
    (
        '100 Trillion\nParameters',
        (hundred_t_x + 0.08, hundred_t_y - 0.25),
        (0.9, 0.65),
        dict(arrowstyle='->', color='black', lw=1),
    ),
    (
        'Nvidia Estimate for\n1 Trillion Parameters',
        (np.log10(1E12) - 0.1, np.log10(nvidia_compute)),
        (0.5, 0.8),
        dict(arrowstyle='-[', color='black', lw=1, connectionstyle='angle3'),
    ),
):
    ax.annotate(
        label,
        xy=arrow_target,
        xytext=label_position,
        textcoords='axes fraction',
        ha='center',
        arrowprops=arrow_style
    )

# Legend, axis labels and limits (both axes show base-10 exponents)
ax.legend(fontsize=12, edgecolor='white')
ax.set_xlabel('Number of Parameters ($10^x$)', fontsize=14)
ax.set_ylabel('petaFLOPS-days ($10^y$)', fontsize=14)
ax.set_xlim(7.5, 15)
ax.set_ylim(0, 8)

# Uncomment to write the figure to disk
#plt.savefig('compute_needs.jpg', dpi=300, bbox_inches='tight')

plt.show()