In [29]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
from collections import defaultdict
from datetime import datetime
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import pyalex
from pyalex import Works

from research_impact import plotting
from research_impact.plotting import save_plot
from research_impact.utils import dict_to_dataarray

In [31]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [32]:
pio.templates.default = "plotly_white"

In [33]:
data_file_location = 'data/innovations/'
os.makedirs(data_file_location, exist_ok=True)
snapshot_datestring = '2023-11-06'

result_file_location = 'results/innovations/'
os.makedirs(result_file_location, exist_ok=True)

In [34]:
# Load dataset

# Snapshot
origins_df = pd.read_csv(data_file_location + f"innovation_origins_snapshot_{snapshot_datestring}.csv")

# Live data
# sheet_id = '1L_j7OaX19HXWWIx_apKvWo2OteY1XOB7FamaLEd_p0s'
# tab_id = '578731623'
# data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={tab_id}&format=csv'
# origins_df = pd.read_csv(data_url)

In [35]:
origins_df.head()

Unnamed: 0,Algorithmic innovation,Origin title,Origin link,Origin publication date,Origin affiliations,Origin authors
0,Kaplan et al. scaling laws,Scaling Laws for Neural Language Models,https://arxiv.org/abs/2001.08361,2020-Jan-23,Johns Hopkins University; OpenAI,"Jared Kaplan, Sam McCandlish, Tom Henighan, To..."
1,Hoffmann et al. scaling laws,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,2022-Mar-29,Google DeepMind,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me..."
2,Transformer (general),Attention Is All You Need,https://arxiv.org/abs/1706.03762,2017-Jun-12,Google Brain; Google Research; University of T...,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak..."
3,Sparse Attention,Generating Long Sequences with Sparse Transfor...,https://arxiv.org/abs/1904.10509,2019-Apr-23,OpenAI,"Rewon Child, Scott Gray, Alec Radford, Ilya Su..."
4,Linear Attention,Transformers are RNNs: Fast Autoregressive Tra...,http://proceedings.mlr.press/v119/katharopoulo...,2020-Jan-01,"Idiap Research Institute, Switzerland; EPFL, S...","Angelos Katharopoulos, Apoorv Vyas, Nikolaos P..."


In [36]:
origins_df.dropna(subset=['Origin affiliations'], inplace=True)
origins_df.head()

Unnamed: 0,Algorithmic innovation,Origin title,Origin link,Origin publication date,Origin affiliations,Origin authors
0,Kaplan et al. scaling laws,Scaling Laws for Neural Language Models,https://arxiv.org/abs/2001.08361,2020-Jan-23,Johns Hopkins University; OpenAI,"Jared Kaplan, Sam McCandlish, Tom Henighan, To..."
1,Hoffmann et al. scaling laws,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,2022-Mar-29,Google DeepMind,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me..."
2,Transformer (general),Attention Is All You Need,https://arxiv.org/abs/1706.03762,2017-Jun-12,Google Brain; Google Research; University of T...,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak..."
3,Sparse Attention,Generating Long Sequences with Sparse Transfor...,https://arxiv.org/abs/1904.10509,2019-Apr-23,OpenAI,"Rewon Child, Scott Gray, Alec Radford, Ilya Su..."
4,Linear Attention,Transformers are RNNs: Fast Autoregressive Tra...,http://proceedings.mlr.press/v119/katharopoulo...,2020-Jan-01,"Idiap Research Institute, Switzerland; EPFL, S...","Angelos Katharopoulos, Apoorv Vyas, Nikolaos P..."


Create institution => origins mapping

In [37]:
institution_aliases = {
    'Google': 'Google',
    'Google Brain': 'Google',
    'Google Research': 'Google',
    'DeepMind': 'DeepMind',
    'Google DeepMind': 'DeepMind',
    'OpenAI': 'OpenAI',
    'Baidu Research': 'Baidu',
    'NVIDIA': 'NVIDIA',
    'Facebook AI Research': 'Meta',
    'Facebook AI': 'Meta',
    'Zhuiyi Technology Co., Ltd.': 'Zhuiyi',
    'Microsoft Research': 'Microsoft',
}

In [38]:
institution_key_algorithms = defaultdict(list)
for i, row in origins_df.iterrows():
    algorithm_name = row['Algorithmic innovation']
    affiliations = row['Origin affiliations']
    affiliations = [affiliation.strip() for affiliation in affiliations.split(';')]
    for affiliation in affiliations:
        if institution_aliases.get(affiliation) is not None:
            alias = institution_aliases[affiliation]
            institution_key_algorithms[alias].append(algorithm_name)
        else:
            print(affiliation, '- no alias')
institution_key_algorithms

Johns Hopkins University - no alias
University of Toronto - no alias
Idiap Research Institute, Switzerland - no alias
EPFL, Switzerland - no alias
University of Washington, Seattle - no alias
University of Geneva - no alias
UC Berkeley - no alias
University of Edinburgh - no alias
University of Zurich - no alias
University of Toronto - no alias
University of Toronto - no alias
Carnegie Mellon University - no alias
University of Washington - no alias
Allen Institute for AI - no alias
University of Toronto - no alias
University of Chicago - no alias
Toyota Technological Institute at Chicago - no alias
Jagiellonian University - no alias
University of Toronto - no alias
University of Freiburg - no alias
University of Amsterdam - no alias
University of Toronto - no alias
University of Freiburg - no alias
UC Berkeley - no alias
University of Toronto - no alias
University of Montreal - no alias
Brno University - no alias
Stanford University - no alias
University at Buffalo, SUNY - no alias
Un

defaultdict(list,
            {'OpenAI': ['Kaplan et al. scaling laws',
              'Sparse Attention',
              'Instruction tuning',
              'RLHF',
              'PPO',
              'Prompting for in-context learning'],
             'DeepMind': ['Hoffmann et al. scaling laws', 'RLHF', 'A2C'],
             'Google': ['Transformer (general)',
              'Transformer (general)',
              'Attention with locality-sensitive hashing',
              'Multi-Query Attention',
              'Grouped Query Attention',
              'LayerNorm',
              'Sinusoidal position embeddings',
              'Sinusoidal position embeddings',
              'Relative position embeddings',
              'SwiGLU activation',
              'Sparsely-Gated Mixture-of-Experts layer (MoE)',
              'Encoder-decoder Transformer',
              'Encoder-decoder Transformer',
              'Causal decoder Transformer (decoder-only)',
              'Language modeling task (with Tr

In [39]:
institution_key_algorithms_count = dict_to_dataarray(institution_key_algorithms, dim='institution', val_fn=len)
institution_key_algorithms_count

In [40]:
fig = go.Figure(data=[
    go.Bar(
        name='Key innovations',
        x=institution_key_algorithms_count.institution,
        y=institution_key_algorithms_count
    ),
])

## Plot layout
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_layout(
    # title='Initial ranking of companies leading in AI research',
    # xaxis_title='Company',
    yaxis_title='Number of innovations for LLMs',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=400,
    height=300,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

plotting.prettify_bar_chart(fig, rotate_x_labels=False)
fig.update_layout(margin=dict(t=40))

## Save plot
save_plot(fig, result_file_location, 'num_key_innovations')

## Show plot
fig.show()

# Count occurrence of innovations directly

In [41]:
# Snapshot
occurrences_df = pd.read_csv(data_file_location + f"innovation_occurrences_snapshot_{snapshot_datestring}.csv", index_col='Algorithmic innovation')

# Live data
# sheet_id = '1L_j7OaX19HXWWIx_apKvWo2OteY1XOB7FamaLEd_p0s'
# tab_id = '1765093800'
# data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={tab_id}&format=csv'
# occurrences_df = pd.read_csv(data_url, index_col='Algorithm')

In [42]:
occurrences_df.head()

Unnamed: 0_level_0,GPT-4,PaLM 2,GPT-3.5,PaLM (540B),Megatron-Turing NLG (530B),ERNIE 3.0 Titan,LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),PanGu-Σ,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithmic innovation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Link,https://arxiv.org/abs/2303.08774,https://ai.google/static/documents/palm2techre...,https://platform.openai.com/docs/models/gpt-3-5,https://arxiv.org/abs/2204.02311,https://arxiv.org/abs/2201.11990,https://arxiv.org/abs/2112.12731,https://ai.meta.com/research/publications/llam...,https://arxiv.org/abs/2112.11446,https://arxiv.org/abs/2203.15556,,...,,,,https://arxiv.org/abs/2208.01448,https://huggingface.co/bigscience/bloom,https://research.facebook.com/publications/no-...,https://arxiv.org/abs/1909.08053,https://cdn.openai.com/better-language-models/...,https://cdn.openai.com/research-covers/languag...,https://proceedings.neurips.cc/paper_files/pap...
Included,0,0,0,1,1,1,0,1,1,1.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
Kaplan et al. scaling laws,,0,,1,1,0,0,1,0,0.0,...,,,,0,1,0,,0,0,0
Hoffmann et al. scaling laws,,1,,0,0,0,1,0,1,0.0,...,,,,1,0,0,,0,0,0
Transformer (general),,1,,1,1,1,1,1,1,1.0,...,,,,1,1,1,,1,1,1


In [43]:
# Replace '?' values with 0
occurrences_df = occurrences_df.replace('?', 0)
occurrences_df

Unnamed: 0_level_0,GPT-4,PaLM 2,GPT-3.5,PaLM (540B),Megatron-Turing NLG (530B),ERNIE 3.0 Titan,LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),PanGu-Σ,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithmic innovation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Link,https://arxiv.org/abs/2303.08774,https://ai.google/static/documents/palm2techre...,https://platform.openai.com/docs/models/gpt-3-5,https://arxiv.org/abs/2204.02311,https://arxiv.org/abs/2201.11990,https://arxiv.org/abs/2112.12731,https://ai.meta.com/research/publications/llam...,https://arxiv.org/abs/2112.11446,https://arxiv.org/abs/2203.15556,,...,,,,https://arxiv.org/abs/2208.01448,https://huggingface.co/bigscience/bloom,https://research.facebook.com/publications/no-...,https://arxiv.org/abs/1909.08053,https://cdn.openai.com/better-language-models/...,https://cdn.openai.com/research-covers/languag...,https://proceedings.neurips.cc/paper_files/pap...
Included,0,0,0,1,1,1,0,1,1,1.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
Kaplan et al. scaling laws,,0,,1,1,0,0,1,0,0.0,...,,,,0,1,0,,0,0,0
Hoffmann et al. scaling laws,,1,,0,0,0,1,0,1,0.0,...,,,,1,0,0,,0,0,0
Transformer (general),,1,,1,1,1,1,1,1,1.0,...,,,,1,1,1,,1,1,1
Sparse Attention,,,,0,0,0,0,0,0,0.0,...,,,,0,0,0,,0,0,0
Linear Attention,,,,0,0,0,0,0,0,0.0,...,,,,0,0,0,,0,0,0
Attention with locality-sensitive hashing,,,,0,0,0,0,0,0,0.0,...,,,,,,,,,,
Multi-Query Attention,,,,1,0,0,0,0,0,0.0,...,,,,,,,,0,0,0
Grouped Query Attention,,,,0,0,0,1,0,0,0.0,...,,,,,,,,0,0,0


In [44]:
keep_systems = ['Algorithmic innovation'] + occurrences_df.columns[occurrences_df.iloc[1].astype(int).astype(bool)].tolist()
keep_systems

['Algorithmic innovation',
 'PaLM (540B)',
 'Megatron-Turing NLG (530B)',
 'ERNIE 3.0 Titan',
 'Gopher (280B)',
 'Chinchilla (70B)',
 'PanGu-Σ',
 'LLaMA (65B)',
 'OPT-175B',
 'Yuan 1.0',
 'AlphaCode']

In [45]:
# Filter columns
selected_systems_occurrences_df = occurrences_df.filter(keep_systems)
selected_systems_occurrences_df

Unnamed: 0_level_0,PaLM (540B),Megatron-Turing NLG (530B),ERNIE 3.0 Titan,Gopher (280B),Chinchilla (70B),PanGu-Σ,LLaMA (65B),OPT-175B,Yuan 1.0,AlphaCode
Algorithmic innovation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Link,https://arxiv.org/abs/2204.02311,https://arxiv.org/abs/2201.11990,https://arxiv.org/abs/2112.12731,https://arxiv.org/abs/2112.11446,https://arxiv.org/abs/2203.15556,,https://arxiv.org/abs/2302.13971,https://arxiv.org/abs/2205.01068,https://arxiv.org/abs/2110.04725,https://arxiv.org/abs/2203.07814
Included,1,1,1,1,1,1.0,1,1,1,1
Kaplan et al. scaling laws,1,1,0,1,0,0.0,0,1,0,0
Hoffmann et al. scaling laws,0,0,0,0,1,0.0,1,0,0,0
Transformer (general),1,1,1,1,1,1.0,1,1,1,1
Sparse Attention,0,0,0,0,0,0.0,0,1,0,0
Linear Attention,0,0,0,0,0,0.0,0,0,0,0
Attention with locality-sensitive hashing,0,0,0,0,0,0.0,0,0,0,0
Multi-Query Attention,1,0,0,0,0,0.0,0,0,0,1
Grouped Query Attention,0,0,0,0,0,0.0,0,0,0,0


In [46]:
selected_systems_occurrences_df = selected_systems_occurrences_df.drop(['Link', 'Included'])
selected_systems_occurrences_df

Unnamed: 0_level_0,PaLM (540B),Megatron-Turing NLG (530B),ERNIE 3.0 Titan,Gopher (280B),Chinchilla (70B),PanGu-Σ,LLaMA (65B),OPT-175B,Yuan 1.0,AlphaCode
Algorithmic innovation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Kaplan et al. scaling laws,1,1,0,1,0,0,0,1,0,0
Hoffmann et al. scaling laws,0,0,0,0,1,0,1,0,0,0
Transformer (general),1,1,1,1,1,1,1,1,1,1
Sparse Attention,0,0,0,0,0,0,0,1,0,0
Linear Attention,0,0,0,0,0,0,0,0,0,0
Attention with locality-sensitive hashing,0,0,0,0,0,0,0,0,0,0
Multi-Query Attention,1,0,0,0,0,0,0,0,0,1
Grouped Query Attention,0,0,0,0,0,0,0,0,0,0
RMSNorm,0,0,0,1,1,0,1,0,0,0
LayerNorm,1,1,1,0,0,1,0,1,1,1


In [47]:
# Make values integers
selected_systems_occurrences_df = selected_systems_occurrences_df.astype(int)

In [48]:
# Sum each row
total_occurrences_by_innovation = selected_systems_occurrences_df.sum(axis=1)
total_occurrences_by_innovation

Algorithmic innovation
Kaplan et al. scaling laws                                      4
Hoffmann et al. scaling laws                                    2
Transformer (general)                                          10
Sparse Attention                                                1
Linear Attention                                                0
Attention with locality-sensitive hashing                       0
Multi-Query Attention                                           2
Grouped Query Attention                                         0
RMSNorm                                                         3
LayerNorm                                                       7
Pre-normalization                                               8
Learnable position embeddings                                   4
Sinusoidal position embeddings                                  1
Relative position embeddings                                    3
Rotary position embeddings                           

In [49]:
institution_key_algorithm_occurrences = defaultdict(dict)
for innovation, occurrence_count in total_occurrences_by_innovation.items():
    if occurrence_count == 0:
        continue
    print(innovation)
    matching_origin = origins_df[origins_df['Algorithmic innovation'] == innovation]
    if len(matching_origin) == 0:
        continue
    origin_row_number = matching_origin.index[0]
    origin_affiliations = origins_df.loc[origin_row_number]['Origin affiliations']
    origin_affiliations = [affiliation.strip() for affiliation in origin_affiliations.split(';')]
    print(origin_affiliations)
    for affiliation in origin_affiliations:
        alias = institution_aliases.get(affiliation)
        print(alias)
        if alias is not None:
            institution_key_algorithm_occurrences[innovation][alias] = occurrence_count
        else:
            print(f'No alias for {affiliation}')
    print(institution_key_algorithm_occurrences)
    print()

institution_key_algorithm_occurrences

Kaplan et al. scaling laws
['Johns Hopkins University', 'OpenAI']
None
No alias for Johns Hopkins University
OpenAI
defaultdict(<class 'dict'>, {'Kaplan et al. scaling laws': {'OpenAI': 4}})

Hoffmann et al. scaling laws
['Google DeepMind']
DeepMind
defaultdict(<class 'dict'>, {'Kaplan et al. scaling laws': {'OpenAI': 4}, 'Hoffmann et al. scaling laws': {'DeepMind': 2}})

Transformer (general)
['Google Brain', 'Google Research', 'University of Toronto']
Google
Google
None
No alias for University of Toronto
defaultdict(<class 'dict'>, {'Kaplan et al. scaling laws': {'OpenAI': 4}, 'Hoffmann et al. scaling laws': {'DeepMind': 2}, 'Transformer (general)': {'Google': 10}})

Sparse Attention
['OpenAI']
OpenAI
defaultdict(<class 'dict'>, {'Kaplan et al. scaling laws': {'OpenAI': 4}, 'Hoffmann et al. scaling laws': {'DeepMind': 2}, 'Transformer (general)': {'Google': 10}, 'Sparse Attention': {'OpenAI': 1}})

Multi-Query Attention
['Google']
Google
defaultdict(<class 'dict'>, {'Kaplan et al. sc

defaultdict(dict,
            {'Kaplan et al. scaling laws': {'OpenAI': 4},
             'Hoffmann et al. scaling laws': {'DeepMind': 2},
             'Transformer (general)': {'Google': 10},
             'Sparse Attention': {'OpenAI': 1},
             'Multi-Query Attention': {'Google': 2},
             'LayerNorm': {'Google': 7},
             'Pre-normalization': {'Meta': 8},
             'Learnable position embeddings': {'Meta': 4},
             'Sinusoidal position embeddings': {'Google': 1},
             'Relative position embeddings': {'Google': 3},
             'Rotary position embeddings': {'Zhuiyi': 2},
             'SwiGLU activation': {'Google': 2},
             'Sparsely-Gated Mixture-of-Experts layer (MoE)': {'Google': 1},
             'Encoder-decoder Transformer': {'Google': 3},
             'Causal decoder Transformer (decoder-only)': {'Google': 7},
             'Language modeling task (with Transformer architecture)': {'Google': 10},
             'Cloze task (with Tran

In [50]:
i = 0
for innovation, occurrence_count in total_occurrences_by_innovation.items():
    if innovation in institution_key_algorithm_occurrences:
        i += 1
        print(i, innovation)

1 Kaplan et al. scaling laws
2 Hoffmann et al. scaling laws
3 Transformer (general)
4 Sparse Attention
5 Multi-Query Attention
6 LayerNorm
7 Pre-normalization
8 Learnable position embeddings
9 Sinusoidal position embeddings
10 Relative position embeddings
11 Rotary position embeddings
12 SwiGLU activation
13 Sparsely-Gated Mixture-of-Experts layer (MoE)
14 Encoder-decoder Transformer
15 Causal decoder Transformer (decoder-only)
16 Language modeling task (with Transformer architecture)
17 Cloze task (with Transformer architecture)
18 Denoising autoencoding task (with Transformer architecture)
19 Dynamic batch size
20 Adafactor optimizer
21 Mixed precision training
22 Instruction tuning
23 RLHF
24 A2C
25 Prompting for in-context learning
26 Chain-of-thought


In [51]:
# Generate the stacked bar chart
fig = go.Figure()

annotations = []
company_count_stack = defaultdict(int)

# Add bar traces
for innovation, company_counts in sorted(institution_key_algorithm_occurrences.items(), key=lambda x: min(x[1].values())):
    y_values = list(company_counts.values())
    x_values = list(company_counts.keys())
    fig.add_trace(go.Bar(name=innovation, x=x_values, y=y_values))

    # Add annotations
    for x, y in zip(x_values, y_values):
        start_y = company_count_stack[x]
        end_y = start_y + y
        middle_y = (start_y + end_y) / 2
        company_count_stack[x] = end_y
        annotation_font = dict(size=10)
        arrowcolor = '#1F95BD'
        if 'Transformer (general)' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-35, showarrow=True, text='Transformer', font=annotation_font, arrowcolor=arrowcolor))
        elif 'LayerNorm' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-60, showarrow=True, text='LayerNorm', font=annotation_font, arrowcolor=arrowcolor))
        elif 'SwiGLU' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-90, showarrow=True, text='SwiGLU activation', font=annotation_font, arrowcolor=arrowcolor))
        # elif 'In-context' in innovation:
        #     annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-45, showarrow=True, text='Prompting for in-context learning', font=annotation_font, arrowcolor=arrowcolor))
        elif 'Instruction' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-65, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'Sparse Attention' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-70, showarrow=True, text='Sparse attention', font=annotation_font, arrowcolor=arrowcolor))
        elif 'Pre-' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=25, ay=-25, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'Hoff' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=25, ay=-25, showarrow=True, text='Chinchilla scaling laws', font=annotation_font, arrowcolor=arrowcolor))
        elif 'Mixed' in innovation and x == 'NVIDIA':
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=15, ay=-15, showarrow=True, text='Mixed-precision training', font=annotation_font, arrowcolor=arrowcolor))


## Plot layout
fig.update_layout(
    barmode='stack',
    xaxis={'categoryorder':'total descending'},
    # title='Occurrence of innovations in the top 10 largest LMs',
    xaxis_title='Company responsible for innovation',
    yaxis_title='Adoption frequency in 10 largest LMs',
    showlegend=False,
    annotations=annotations,
    autosize=False,
    width=480,
    height=250,
    # height=360,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=30, b=20),
)

plotting.prettify_bar_chart(fig, rotate_x_labels=False)
fig.update_layout(margin=dict(t=40))
fig.update_yaxes(range=[0, 59])

## Save plot
save_plot(fig, result_file_location, 'key_innovations_occurrence')

# Show the figure
fig.show()

In [52]:
innovation_occurrence_by_institution = defaultdict(dict)
for innovation, institution_counts in institution_key_algorithm_occurrences.items():
    for institution, count in institution_counts.items():
        innovation_occurrence_by_institution[institution][innovation] = count
innovation_occurrence_by_institution

defaultdict(dict,
            {'OpenAI': {'Kaplan et al. scaling laws': 4,
              'Sparse Attention': 1,
              'Instruction tuning': 4,
              'RLHF': 1,
              'Prompting for in-context learning': 9},
             'DeepMind': {'Hoffmann et al. scaling laws': 2,
              'RLHF': 1,
              'A2C': 1},
             'Google': {'Transformer (general)': 10,
              'Multi-Query Attention': 2,
              'LayerNorm': 7,
              'Sinusoidal position embeddings': 1,
              'Relative position embeddings': 3,
              'SwiGLU activation': 2,
              'Sparsely-Gated Mixture-of-Experts layer (MoE)': 1,
              'Encoder-decoder Transformer': 3,
              'Causal decoder Transformer (decoder-only)': 7,
              'Language modeling task (with Transformer architecture)': 10,
              'Cloze task (with Transformer architecture)': 1,
              'Dynamic batch size': 5,
              'Adafactor optimizer': 1,
 

In [53]:
num_institutions = len(innovation_occurrence_by_institution.keys())
# Manual way to order the institutions
institution_order = ['Google', 'OpenAI', 'Meta', 'DeepMind', 'Zhuiyi', 'NVIDIA', 'Baidu']

# Create subplots with shared y-axis
fig = make_subplots(
    rows=1,
    cols=num_institutions,
    shared_yaxes=True,
    column_widths=[0.5 * len(innovation_occurrence_by_institution[institution]) for institution in institution_order],
    horizontal_spacing=0,
)

# Add traces
for i, institution in enumerate(institution_order):
    innovation_counts = innovation_occurrence_by_institution[institution]
    x = list(innovation_counts.keys())
    y = list(innovation_counts.values())
    texts = [xi if yi > 7 else None for xi, yi in zip(x, y)]
    fig.add_trace(
        go.Bar(
            x=x,
            y=y,
            name=f'{institution} ({sum(y)} total)',
            text=texts,
            textfont=dict(color='white', size=12),
            # marker_color='#636EFA',
            width=1
        ), 
        row=1,
        col=i+1
    )
    fig.update_xaxes(
        categoryorder='total descending',
        showticklabels=False,
        ticks='',
        col=i+1
    )        

## Plot layout
fig.update_layout(
    xaxis={'categoryorder':'total descending'},
    # title='Occurrence of innovations in the top 10 largest LMs',
    # xaxis_title='Company responsible for innovation',
    yaxis_title='Number of occurrences',
    legend=dict(
        orientation='h',
        y=0,
        x=0,
    ),
    autosize=False,
    width=480,
    height=250,
    # height=360,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=0, r=0, t=0, b=0),
)

save_plot(fig, result_file_location, 'key_innovations_occurrence_grouped')

fig.show()


In [54]:
institution_key_algorithm_occurrences

defaultdict(dict,
            {'Kaplan et al. scaling laws': {'OpenAI': 4},
             'Hoffmann et al. scaling laws': {'DeepMind': 2},
             'Transformer (general)': {'Google': 10},
             'Sparse Attention': {'OpenAI': 1},
             'Multi-Query Attention': {'Google': 2},
             'LayerNorm': {'Google': 7},
             'Pre-normalization': {'Meta': 8},
             'Learnable position embeddings': {'Meta': 4},
             'Sinusoidal position embeddings': {'Google': 1},
             'Relative position embeddings': {'Google': 3},
             'Rotary position embeddings': {'Zhuiyi': 2},
             'SwiGLU activation': {'Google': 2},
             'Sparsely-Gated Mixture-of-Experts layer (MoE)': {'Google': 1},
             'Encoder-decoder Transformer': {'Google': 3},
             'Causal decoder Transformer (decoder-only)': {'Google': 7},
             'Language modeling task (with Transformer architecture)': {'Google': 10},
             'Cloze task (with Tran

In [55]:
company_names = set()
for innovation, company_counts in institution_key_algorithm_occurrences.items():
    company_names.update(list(company_counts.keys()))
company_names = list(company_names)
labels = company_names
parents = [''] * len(company_names)

In [56]:
company_values = defaultdict(list)
for innovation, company_counts in institution_key_algorithm_occurrences.items():
    for company_name, count in company_counts.items():
        company_values[company_name].append(count)

In [57]:
values = [sum(company_values[company_name]) for company_name in company_names]

In [58]:
print(labels)
print(parents)
print(values)

['Meta', 'Baidu', 'NVIDIA', 'Zhuiyi', 'Google', 'DeepMind', 'OpenAI']
['', '', '', '', '', '', '']
[13, 3, 3, 2, 55, 4, 19]


In [59]:
for innovation, company_counts in institution_key_algorithm_occurrences.items():
    for company_name, count in company_counts.items():
        labels.append(innovation)
        parents.append(company_name)
        values.append(count)

In [60]:
print(labels)
print(parents)
print(values)

['Meta', 'Baidu', 'NVIDIA', 'Zhuiyi', 'Google', 'DeepMind', 'OpenAI', 'Kaplan et al. scaling laws', 'Hoffmann et al. scaling laws', 'Transformer (general)', 'Sparse Attention', 'Multi-Query Attention', 'LayerNorm', 'Pre-normalization', 'Learnable position embeddings', 'Sinusoidal position embeddings', 'Relative position embeddings', 'Rotary position embeddings', 'SwiGLU activation', 'Sparsely-Gated Mixture-of-Experts layer (MoE)', 'Encoder-decoder Transformer', 'Causal decoder Transformer (decoder-only)', 'Language modeling task (with Transformer architecture)', 'Cloze task (with Transformer architecture)', 'Denoising autoencoding task (with Transformer architecture)', 'Dynamic batch size', 'Adafactor optimizer', 'Mixed precision training', 'Mixed precision training', 'Instruction tuning', 'RLHF', 'RLHF', 'A2C', 'Prompting for in-context learning', 'Chain-of-thought']
['', '', '', '', '', '', '', 'OpenAI', 'DeepMind', 'Google', 'OpenAI', 'Google', 'Google', 'Meta', 'Meta', 'Google', 'G

In [61]:
fig = go.Figure(go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    textinfo="label+value",
))

## Plot layout
fig.update_layout(
    width=480,
    height=360,
    font=dict(size=12),
    # uniformtext=dict(minsize=6, mode='hide'),
    margin=dict(l=20, r=20, t=30, b=20),
)

fig.show()

In [62]:
import plotly.graph_objects as go

# Sample Data
labels = ["Google", "OpenAI", "Meta", 
          "Algorithm A", "Algorithm B", "Algorithm C", 
          "Algorithm D", "Algorithm E",
          "Algorithm F"]

parents = ["", "", "", 
           "Google", "Google", "Google", 
           "OpenAI", "OpenAI",
           "Meta"]

values = [40, 20, 20,   # Company total values
          10, 10, 20,   # Google's algorithm values
          10, 10,       # OpenAI's algorithm values
          20]           # Meta's algorithm value

fig = go.Figure(go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    marker_colors=["blue", "purple", "green"] + ["lightblue"]*3 + ["lightpurple"]*2 + ["lightgreen"],
    textinfo="label+value"
))

## Plot layout
fig.update_layout(
    width=480,
    # height=250,
    height=360,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=30, b=20),
)

fig.show()
