In [1]:
from datetime import datetime
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from researcher_impact.plotting import save_plot

In [2]:
pio.templates.default = "plotly_white"

In [3]:
result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

# Largest training runs for leading companies

In [4]:
company_aliases = {
    "Google": "Google",
    "Google Research": "Google",
    "Google Brain": "Google",
    "Google Inc.": "Google",
    "Google AI, Brain team": "Google",
    "Google Research, Brain Team": "Google",
    "Google AI": "Google",
    "Google Brain,Google Research": "Google",
    "Google Inc": "Google",
    "DeepMind": "DeepMind",
    "Google DeepMind": "DeepMind",
    "Meta AI": "Meta",
    "MetaAI": "Meta",
    "Facebook AI Research": "Meta",
    "Facebook AI research": "Meta",
    "Facebook": "Meta",
    "Facebook AI": "Meta",
    "OpenAI": "OpenAI",
    "Open AI": "OpenAI",
    "Microsoft Research": "Microsoft",
    "Microsoft": "Microsoft",
    "Microsoft Research,Peking University": "Microsoft",
    "Microsoft Bing": "Microsoft",
    "Alibaba Group": "Alibaba",
    "NVIDIA": "NVIDIA",
    "Nvidia": "NVIDIA",
    "Baidu Research- Silicon Valley AI Lab": "Baidu",
    "Baidu": "Baidu",
    "Amazon": "Amazon",
}

- Sort by publication date
- Filter companies of interest + rename companies to consistent alias => new DataFrame
- Get the maximum envelope of the data => new DataFrame
- Add on the last maximum to the current date, so there's a horizontal line continuing until the current date

In [5]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
sheet_id = '1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet='
df = pd.read_csv(data_url + 'ALL%20ML%20SYSTEMS')

In [6]:
cols_of_interest = ["System", "Organization", "Publication date", "Training compute (FLOP)"]

In [7]:
df = df.dropna(subset=["Organization", "Training compute (FLOP)"])

In [8]:
# Rejected samples
# We don't think the full training run was actually done
df.drop(df[df["System"] == "Megatron-LM (1T)"].index, inplace=True)
# Kingma was affiliated with OpenAI on the paper for this system, but only in the 2017 version rather than the original 2014 version
df.drop(df[df["System"] == "ADAM (CIFAR-10)"].index, inplace=True)


In [9]:
df['Publication date'] = pd.to_datetime(df['Publication date'])  # Ensure date column is in datetime format

In [10]:
df.sort_values('Publication date', inplace=True)
df.reset_index(inplace=True)

In [11]:
df[cols_of_interest]

Unnamed: 0,System,Organization,Publication date,Training compute (FLOP)
0,Theseus,Bell Laboratories,1950-07-02,4.000000e+01
1,Perceptron Mark I,"Cornell Aeronautical Laboratory,Cornell Univer...",1957-01-01,6.950000e+05
2,Pandemonium (morse),Massachusetts Institute of Technology,1959-02-01,6.000000e+08
3,Samuel Neural Checkers,IBM,1959-07-01,4.280000e+08
4,ADALINE,Stanford University,1960-06-30,9.900000e+03
...,...,...,...,...
176,BLOOM,"Hugging Face,BigScience",2022-11-08,1.800000e+23
177,AR-LDM,"Alibaba,University of Waterloo,Vector Institute",2022-11-20,5.100000e+20
178,LLaMA (65B),Meta AI,2023-02-24,5.500000e+23
179,GPT-4,OpenAI,2023-03-15,2.100000e+25


## Filter companies of interest + rename companies to consistent alias

In [12]:
rows = []
for i, row in df.iterrows():
    orgs = row["Organization"]
    print(orgs)
    org_list = [org.strip() for org in orgs.split(",")]
    for org in org_list:
        if org in company_aliases.keys():
            alias = company_aliases[org]
            new_row = row.copy()
            new_row["Organization"] = alias
            rows.append(new_row)

company_df = pd.DataFrame(rows)
company_df[cols_of_interest]

Bell Laboratories
Cornell Aeronautical Laboratory,Cornell University
Massachusetts Institute of Technology
IBM
Stanford University
NHK Broadcasting Science Research Laboratories
University of California
Princeton University
Stanford, CalTech
AT&T Bell Laboratories
Carnegie Mellon University 
IBM
Indian Statistical Institute
Carnegie Mellon University
The Technical University of Munich
National Chiao Tung University
AT&T Labs
Mitsubishi Electric Research Labs and Compaq CRL
Université de Montréal
IDSIA and TU Munich
Stanford
IDSIA ; University of Lugano & SUPSI
University of Montreal
Brno University of Technology, Johns Hopkins University
Brno University of Technology,Johns Hopkins University
IDSIA
University of Toronto
University of Toronto
University of Toronto
Google
University of Toronto
Universidad Nacional de Cordoba,Xerox Research Centre Europe,Inteligent Systems Lab Amsterdam,University of Amsterdam,LEAR Team,INRIA Grenoble
IDSIA
Google
NYU
CNRS,Google
DeepMind
Univeristy of Ams

Unnamed: 0,System,Organization,Publication date,Training compute (FLOP)
29,Unsupervised High-level Feature Learner,Google,2012-07-12,6.000000e+17
33,Word2Vec (large),Google,2013-10-16,3.890000e+16
35,TransE,Google,2013-12-05,1.340000e+18
36,DQN,DeepMind,2013-12-19,2.300000e+15
39,SPPNet,Microsoft,2014-06-18,3.410000e+18
...,...,...,...,...
172,AlexaTM 20B,Amazon,2022-08-02,2.040000e+23
174,Whisper,OpenAI,2022-09-21,4.650000e+22
178,LLaMA (65B),Meta,2023-02-24,5.500000e+23
179,GPT-4,OpenAI,2023-03-15,2.100000e+25


In [13]:
company_max_compute_df = company_df.copy()

In [14]:
company_max_compute_df["Training compute (FLOP)"] = company_df.groupby("Organization")['Training compute (FLOP)'].cummax()
company_max_compute_df[cols_of_interest]

Unnamed: 0,System,Organization,Publication date,Training compute (FLOP)
29,Unsupervised High-level Feature Learner,Google,2012-07-12,6.000000e+17
33,Word2Vec (large),Google,2013-10-16,6.000000e+17
35,TransE,Google,2013-12-05,1.340000e+18
36,DQN,DeepMind,2013-12-19,2.300000e+15
39,SPPNet,Microsoft,2014-06-18,3.410000e+18
...,...,...,...,...
172,AlexaTM 20B,Amazon,2022-08-02,2.040000e+23
174,Whisper,OpenAI,2022-09-21,3.140000e+23
178,LLaMA (65B),Meta,2023-02-24,5.500000e+23
179,GPT-4,OpenAI,2023-03-15,2.100000e+25


In [15]:
rows_to_add = []
for org, group_data in company_max_compute_df.groupby("Organization"):
    print(org, group_data["Training compute (FLOP)"].max())
    current_date = datetime.now().date()  # Get the current date
    compute = group_data["Training compute (FLOP)"].max()
    # Create a new row with NaN for all columns except "Organization", "Publication date", and "Training compute (FLOP)"
    new_row = pd.Series(
        {
            "Organization": org,
            "Publication date": current_date,
            "Training compute (FLOP)": compute,
        }
    )
    rows_to_add.append(new_row)

# Create a DataFrame from the rows to be added
new_data = pd.DataFrame(rows_to_add)

# Concatenate the new_data DataFrame to the original filtered_df
company_max_compute_df = pd.concat([company_max_compute_df, new_data], ignore_index=True)

Alibaba 3.6e+22
Amazon 2.04e+23
Baidu 3.14e+23
DeepMind 6.31e+23
Google 7.34e+24
Meta 5.5e+23
Microsoft 1.17e+24
NVIDIA 1.17e+24
OpenAI 2.1e+25


In [16]:
company_max_compute_df

Unnamed: 0,index,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,376.0,Unsupervised High-level Feature Learner,Vision,Image classification,Google,Industry,"Quoc V. Le, Marc'Aurelio Ranzato, Rajat Monga,...",2012-07-12 00:00:00,Building High-level Features Using Large Scale...,https://arxiv.org/pdf/1112.6209.pdf,...,"""We train this network using model parallelism...",,Unsupervised,,Hardware not reported,,Industry,Likely,We consider the problem of building high-level...,2023-06-15 15:50:10
1,362.0,Word2Vec (large),Language,Semantic embedding,Google,Industry,"T Mikolov, I Sutskever, K Chen, GS Corrado",2013-10-16 00:00:00,Distributed Representations of Words and Phras...,https://arxiv.org/abs/1310.4546,...,,,,0.55,,,Industry,,,2023-06-14 14:54:39
2,358.0,TransE,Other,Entity embedding,Google,Industry - Academia Collaboration,"Antoine Bordes, Nicolas Usunier, Alberto Garci...",2013-12-05 00:00:00,Translating Embeddings for Modeling Multi- rel...,https://papers.nips.cc/paper/2013/hash/1cecc7a...,...,,,,17.58,,,Industry,,,2023-08-03 20:32:27
3,355.0,DQN,Games,Atari,DeepMind,Industry,"V Mnih, K Kavukcuoglu, D Silver, A Graves",2013-12-19 00:00:00,Playing Atari with Deep Reinforcement Learning,https://arxiv.org/abs/1312.5602,...,,,,0.04,,,Industry,,,2023-06-14 14:57:17
4,344.0,SPPNet,Vision,Image classification,Microsoft,Industry - Academia Collaboration,"Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun",2014-06-18 00:00:00,Spatial Pyramid Pooling in Deep Convolutional ...,https://arxiv.org/abs/1406.4729,...,"""All networks in this paper can be trained on ...",NVIDIA GeForce GTX TITAN,,65.07,,,Industry,,,2023-08-01 10:30:07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,,,,,Google,,,2023-08-17,,,...,,,,,,,,,,
111,,,,,Meta,,,2023-08-17,,,...,,,,,,,,,,
112,,,,,Microsoft,,,2023-08-17,,,...,,,,,,,,,,
113,,,,,NVIDIA,,,2023-08-17,,,...,,,,,,,,,,


In [17]:
company_max_compute_df[cols_of_interest]

Unnamed: 0,System,Organization,Publication date,Training compute (FLOP)
0,Unsupervised High-level Feature Learner,Google,2012-07-12 00:00:00,6.000000e+17
1,Word2Vec (large),Google,2013-10-16 00:00:00,6.000000e+17
2,TransE,Google,2013-12-05 00:00:00,1.340000e+18
3,DQN,DeepMind,2013-12-19 00:00:00,2.300000e+15
4,SPPNet,Microsoft,2014-06-18 00:00:00,3.410000e+18
...,...,...,...,...
110,,Google,2023-08-17,7.340000e+24
111,,Meta,2023-08-17,5.500000e+23
112,,Microsoft,2023-08-17,1.170000e+24
113,,NVIDIA,2023-08-17,1.170000e+24


In [18]:
current_max = 0
current_max_org = "Google"
global_max_rows = []
for i, row in company_max_compute_df.iterrows():
    if row["Training compute (FLOP)"] > current_max:
        current_max = row["Training compute (FLOP)"]
        if row["Organization"] != current_max_org:
            current_max_org = row["Organization"]
            global_max_rows.append(row)
global_max_df = pd.DataFrame(global_max_rows)
global_max_df[cols_of_interest]

Unnamed: 0,System,Organization,Publication date,Training compute (FLOP)
4,SPPNet,Microsoft,2014-06-18,3.41e+18
5,Seq2Seq LSTM,Google,2014-09-10,5.6e+19
8,AlphaGo Fan,DeepMind,2015-10-01,3.8e+20
12,GNMT,Google,2016-09-26,6.9e+21
15,AlphaGo Master,DeepMind,2017-01-01,1.5e+23
83,Megatron-Turing NLG 530B,Microsoft,2021-10-11,1.17e+24
92,PaLM (540B),Google,2022-04-04,2.53e+24
104,GPT-4,OpenAI,2023-03-15,2.1e+25


In [20]:
# Create a line plot
fig = px.line(
    company_max_compute_df[company_max_compute_df["Organization"].isin(["Google", "DeepMind", "OpenAI", "Meta", "Microsoft"])],
    x="Publication date", 
    y="Training compute (FLOP)", 
    line_shape="hv",
    color="Organization",
    labels={"Training compute (FLOP)": "Largest published training run to date (FLOP)"},
    category_orders={"Organization": ["OpenAI", "Google", "Microsoft", "DeepMind", "Meta"]},
    hover_data=["System"],
)
fig.add_trace(
    go.Scatter(
        x=global_max_df["Publication date"],
        y=global_max_df["Training compute (FLOP)"],
        text=global_max_df["System"],
        textposition="top left",
        line={"color": "black"},
        mode="markers+text",
        name="Changes in leader",
    )
)
fig.add_trace(
    go.Scatter(
        x=pd.to_datetime(["2020-05-28"]),
        y=[3.14e23],
        text=["GPT-3 (175B)"],
        textposition="top left",
        line={"color": "black"},
        mode="markers+text",
        marker_symbol="x",
        name="Other notable systems",
    )
)
# fig.add_trace(
#     go.Scatter(
#         x=pd.to_datetime(["2020-05-28", "2023-02-24"]),
#         y=[3.14e23, 5.50e23],
#         text=["GPT-3 (175B)", "LLaMA (65B)"],
#         textposition=["top left", "middle left"],
#         line={"color": "black"},
#         mode="markers+text",
#         marker_symbol="x",
#         name="Other notable systems",
#     )
# )

# Convert year to datetime
fig.update_yaxes(type="log")
# Show all years on x axis
fig.update_xaxes(nticks=12)
# Make sure the labels are fully shown
fig.update_xaxes(range=[pd.to_datetime(2011, format="%Y"), datetime.now().date()])
fig.update_layout(
    legend=dict(
        title="",
        orientation="h",
        # yanchor="top",
        y=-0.15,
        # xanchor="center",
        x=0,
    ),
)
# Edit figure layout
fig.update_layout(
    autosize=False,
    width=400,
    height=415,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=0),
)

save_plot(fig, result_file_location, 'companies_largest_compute_all')

fig.show()

# Largest training runs for any company

In [37]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
sheet_id = '1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet='
df = pd.read_csv(data_url + 'ALL%20ML%20SYSTEMS')

In [38]:
df = df.dropna(subset=["Training compute (FLOP)"])

In [39]:
# Rejected samples
# We don't think the full training run was actually done
df.drop(df[df["System"] == "Megatron-LM (1T)"].index, inplace=True)

In [40]:
df['Publication date'] = pd.to_datetime(df['Publication date'])  # Ensure date column is in datetime format

In [41]:
df.sort_values('Publication date', inplace=True)

In [42]:
df.reset_index(inplace=True)

In [43]:
df

Unnamed: 0,index,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,557,Theseus,Other,Maze solving,Bell Laboratories,Industry,Claude Shannon,1950-07-02,Mighty Mouse,https://www.technologyreview.com/2018/12/19/13...,...,,,,,,,Industry,,,2023-05-29 20:51:04
1,551,Perceptron Mark I,Vision,Binary classification,"Cornell Aeronautical Laboratory,Cornell Univer...",Industry,F Rosenblatt,1957-01-01,The Perceptron—a perceiving and recognizing au...,https://blogs.umass.edu/brain-wars/files/2016/...,...,,,,,,,Industry,,,2023-08-15 18:01:22
2,550,Pandemonium (morse),Other,Morse translation,Massachusetts Institute of Technology,Academia,OG Selfridge,1959-02-01,Pandemonium: A Paradigm for Learning,https://aitopics.org/doc/classics:504E1BAC/,...,,,,,,,Academia,Speculative,,2023-07-25 18:00:25
3,549,Samuel Neural Checkers,Games,Checkers,IBM,Industry,Arthur L. Samuel,1959-07-01,Some studies in machine learning using the gam...,https://ieeexplore.ieee.org/abstract/document/...,...,,,,,,,Industry,,,2023-05-29 20:51:04
4,546,ADALINE,Vision,Pattern recognition,Stanford University,Academia,Widrow and Hoff,1960-06-30,Adaptive switching circuits,https://isl.stanford.edu/~widrow/papers/c1960a...,...,,,,,,,Academia,,,2023-05-29 20:51:04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,17,BLOOM,Language,Language model,"Hugging Face,BigScience",Research Collective,"Margaret Mitchell, Giada Pistilli, Yacine Jern...",2022-11-08,BigScience Large Open-science Open-access Mult...,https://huggingface.co/bigscience/bloom,...,,,Self-supervised learning,,,Yes,,,,2023-08-04 13:13:07
180,14,AR-LDM,Multimodal,Text-to-image,"Alibaba,University of Waterloo,Vector Institute",Industry - Academia Collaboration (Industry le...,"Xichen Pan, Pengda Qin, Yuhong Li, Hui Xue, We...",2022-11-20,Synthesizing Coherent Story with Auto-Regressi...,https://arxiv.org/abs/2211.10950,...,8 NVIDIA A100 GPUs for 8 days,NVIDIA A100,,,,,,Confident,Conditioned diffusion models have demonstrated...,2023-08-01 09:57:26
181,10,LLaMA (65B),Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Thibaut Lavril, Gautier Izacard,...",2023-02-24,LLaMA: Open and Efficient Foundation Language ...,https://arxiv.org/abs/2302.13971,...,"""When training a 65B-parameter model, our code...",NVIDIA A100,Supervised,1179384.75,1023384 processor-hours on A100 GPUs. May 2023...,,Industry,Likely,"We introduce LLaMA, a collection of foundation...",2023-07-28 16:26:34
182,9,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,https://arxiv.org/abs/2303.08774,...,,,Self-supervised learning,,,Yes,,,,2023-08-06 22:13:42


In [44]:
# Filter rows with the top 20 largest values for 'Training compute (FLOP)" column since 2021
top_20_compute_df = df[df['Publication date'] >= pd.to_datetime('2021-01-01')].nlargest(20, 'Training compute (FLOP)')

In [45]:
top_20_compute_df.reset_index(inplace=True)
top_20_compute_df

Unnamed: 0,level_0,index,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,182,9,GPT-4,Multimodal,Language modelling,OpenAI,Industry,OpenAI,2023-03-15,GPT-4 Technical Report,...,,,Self-supervised learning,,,Yes,,,,2023-08-06 22:13:42
1,183,8,PaLM 2,Language,Language modelling,Google,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,...,,,,,PaLM 2 was trained on TPU v4 according to the ...,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-08-10 15:21:27
2,173,26,Minerva (540B),Language,Quantitative Reasoning Problems,Google,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,...,,,Self-supervised learning,3267257.75,,Yes,Industry,,Language models have achieved remarkable perfo...,2023-08-10 15:22:32
3,163,43,PaLM (540B),Language,Language modelling,Google Research,Industry,"Aakanksha Chowdhery, Sharan Narang, Jacob Devl...",2022-04-04,PaLM: Scaling Language Modeling with Pathways,...,,,Self-supervised learning,3232806.53,,Yes,Industry,,Large language models have been shown to achie...,2023-08-11 19:08:06
4,150,73,Megatron-Turing NLG 530B,Language,,"Microsoft,NVIDIA",Industry,"Ali Alvi, Paresh Kharya",2021-10-11,Using DeepSpeed and Megatron to Train Megatron...,...,,,Self-supervised learning,3046994.09,,Yes,Industry,,Pretrained general-purpose language models can...,2023-08-15 14:40:57
5,156,62,Gopher,Language,Language modelling,DeepMind,Industry,"Jack W. Rae, Sebastian Borgeaud, Trevor Cai, K...",2021-12-08,"Scaling Language Models: Methods, Analysis & I...",...,,,,891638.8,,Yes,Industry,,We enhance auto-regressive language models by ...,2023-05-29 20:51:04
6,162,44,Chinchilla,Language,Language modelling,DeepMind,Industry,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me...",2022-03-29,Training Compute-Optimal Large Language Models,...,,,,753491.58,,Yes,Industry,,We investigate the optimal model size and numb...,2023-05-29 20:51:04
7,181,10,LLaMA (65B),Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Thibaut Lavril, Gautier Izacard,...",2023-02-24,LLaMA: Open and Efficient Foundation Language ...,...,"""When training a 65B-parameter model, our code...",NVIDIA A100,Supervised,1179384.75,1023384 processor-hours on A100 GPUs. May 2023...,,Industry,Likely,"We introduce LLaMA, a collection of foundation...",2023-07-28 16:26:34
8,166,38,OPT-175B,Language,Language modelling,Meta AI,Industry,"Susan Zhang, Stephen Roller, Naman Goyal, Mike...",2022-05-02,OPT: Open Pre-trained Transformer Language Models,...,,,,1654082.5,,Yes,Industry,,"Large language models, which are often trained...",2023-05-29 20:51:04
9,151,72,Yuan 1.0,Language,,Inspur,Industry,"Shaohua Wu, Xudong Zhao, Tong Yu, Rongguo Zhan...",2021-10-12,Yuan 1.0: Large-Scale Pre-trained Language Mod...,...,,,,606364.75,,Yes,Industry,,Recent work like GPT-3 has demonstrated excell...,2023-05-29 20:51:04


In [47]:
for i, row in top_20_compute_df.iterrows():
    print(i+1, row['Link'])

1 https://arxiv.org/abs/2303.08774
2 https://ai.google/static/documents/palm2techreport.pdf
3 https://arxiv.org/abs/2206.14858
4 https://arxiv.org/abs/2204.02311
5 https://www.microsoft.com/en-us/research/blog/using-deepspeed-and-megatron-to-train-megatron-turing-nlg-530b-the-worlds-largest-and-most-powerful-generative-language-model/
6 https://deepmind.com/blog/article/language-modelling-at-scale
7 https://arxiv.org/abs/2203.15556
8 https://arxiv.org/abs/2302.13971
9 https://ai.facebook.com/blog/democratizing-access-to-large-scale-language-models-with-opt-175b/
10 https://arxiv.org/abs/2110.04725
11 https://arxiv.org/pdf/2203.07814.pdf
12 https://arxiv.org/abs/2206.10789v1
13 https://uploads-ssl.webflow.com/60fd4503684b466578c0d307/61138924626a6981ee09caf6_jurassic_tech_paper.pdf
14 https://arxiv.org/abs/2201.08239
15 https://www.gwern.net/docs/ai/scaling/2021-10-11-xinzhiyuan-inspursource10gpt245b.html
16 https://arxiv.org/abs/2112.12731
17 https://medium.com/yandex/yandex-publishes-