In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from collections import defaultdict
from datetime import datetime
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import pyalex
from pyalex import Works

from researcher_impact.plotting import save_plot
from researcher_impact.utils import dict_to_dataarray

In [4]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [5]:
pio.templates.default = "plotly_white"

In [6]:
result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

In [7]:
# Download dataset
sheet_id = '1L_j7OaX19HXWWIx_apKvWo2OteY1XOB7FamaLEd_p0s'
tab_id = '578731623'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={tab_id}&format=csv'
origins_df = pd.read_csv(data_url)

In [8]:
origins_df.head()

Unnamed: 0,Algorithm,Origin title,Origin link,Origin publication date,Origin affiliations,Origin authors
0,Transformer,Attention Is All You Need,https://arxiv.org/abs/1706.03762,2017-Jun-12,Google Brain; Google Research; University of T...,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak..."
1,Kaplan et al. scaling laws,Scaling Laws for Neural Language Models,https://arxiv.org/abs/2001.08361,2020-Jan-23,Johns Hopkins University; OpenAI,"Jared Kaplan, Sam McCandlish, Tom Henighan, To..."
2,Hoffmann et al. scaling laws,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,2022-Mar-29,Google DeepMind,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me..."
3,Causal objective,Improving Language Understanding by Generative...,https://openai.com/research/language-unsupervised,2018-Jun-11,OpenAI,"Alec Radford, Karthik Narasimhan, Tim Salimans..."
4,Masked/denoising objective,BERT: Pre-training of Deep Bidirectional Trans...,https://arxiv.org/abs/1810.04805,2018-Oct-11,Google,"Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kris..."


In [9]:
origins_df.dropna(subset=['Origin affiliations'], inplace=True)
origins_df.head()

Unnamed: 0,Algorithm,Origin title,Origin link,Origin publication date,Origin affiliations,Origin authors
0,Transformer,Attention Is All You Need,https://arxiv.org/abs/1706.03762,2017-Jun-12,Google Brain; Google Research; University of T...,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak..."
1,Kaplan et al. scaling laws,Scaling Laws for Neural Language Models,https://arxiv.org/abs/2001.08361,2020-Jan-23,Johns Hopkins University; OpenAI,"Jared Kaplan, Sam McCandlish, Tom Henighan, To..."
2,Hoffmann et al. scaling laws,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,2022-Mar-29,Google DeepMind,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me..."
3,Causal objective,Improving Language Understanding by Generative...,https://openai.com/research/language-unsupervised,2018-Jun-11,OpenAI,"Alec Radford, Karthik Narasimhan, Tim Salimans..."
4,Masked/denoising objective,BERT: Pre-training of Deep Bidirectional Trans...,https://arxiv.org/abs/1810.04805,2018-Oct-11,Google,"Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kris..."


Create institution => origins mapping

In [10]:
institution_aliases = {
    'Google': 'Google',
    'Google Brain': 'Google',
    'Google Research': 'Google',
    'DeepMind': 'DeepMind',
    'Google DeepMind': 'DeepMind',
    'OpenAI': 'OpenAI',
    'Baidu Research': 'Baidu',
    'NVIDIA': 'NVIDIA',
    'Facebook AI Research': 'Meta',
    'Zhuiyi Technology Co., Ltd.': 'Zhuiyi',
}

In [11]:
institution_key_algorithms = defaultdict(list)
for i, row in origins_df.iterrows():
    algorithm_name = row['Algorithm']
    affiliations = row['Origin affiliations']
    affiliations = [affiliation.strip() for affiliation in affiliations.split(';')]
    for affiliation in affiliations:
        if institution_aliases.get(affiliation) is not None:
            alias = institution_aliases[affiliation]
            institution_key_algorithms[alias].append(algorithm_name)
institution_key_algorithms

defaultdict(list,
            {'Google': ['Transformer',
              'Transformer',
              'Masked/denoising objective',
              'Chain-of-thought',
              'Decoder-only architecture',
              'LayerNorm',
              'Sinusoidal position embeddings',
              'Sinusoidal position embeddings',
              'Relative position encodings',
              'SwiGLU activation',
              'Sparsely-Gated Mixture-of-Experts layer (MoE)',
              'Multi-Query Attention',
              'Grouped Query Attention',
              'Dynamic batch size'],
             'OpenAI': ['Kaplan et al. scaling laws',
              'Causal objective',
              'Instruction tuning',
              'RLHF',
              'PPO',
              'Sparse Attention'],
             'DeepMind': ['Hoffmann et al. scaling laws', 'RLHF'],
             'Baidu': ['Mixed precision training'],
             'NVIDIA': ['Mixed precision training'],
             'Meta': ['Pre-normaliza

In [12]:
institution_key_algorithms_count = dict_to_dataarray(institution_key_algorithms, dim='institution', val_fn=len)
institution_key_algorithms_count

In [13]:
fig = go.Figure(data=[
    go.Bar(
        name='Key innovations',
        x=institution_key_algorithms_count.institution,
        y=institution_key_algorithms_count
    ),
])

## Plot layout
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_layout(
    # title='Initial ranking of companies leading in AI research',
    # xaxis_title='Company',
    yaxis_title='Number of innovations for LLMs',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=400,
    height=300,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'num_key_innovations')

## Show plot
fig.show()

# Count occurrence of innovations directly

In [14]:
# Download dataset
sheet_id = '1L_j7OaX19HXWWIx_apKvWo2OteY1XOB7FamaLEd_p0s'
tab_id = '1765093800'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={tab_id}&format=csv'
occurrences_df = pd.read_csv(data_url, index_col='Algorithm')

In [15]:
occurrences_df.head()

Unnamed: 0_level_0,GPT-4,PaLM 2,Minerva (540B),PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,https://ai.google/static/documents/palm2techre...,https://arxiv.org/abs/2206.14858,https://arxiv.org/abs/2204.02311,https://arxiv.org/abs/2201.11990,https://ai.meta.com/research/publications/llam...,https://arxiv.org/abs/2112.11446,https://arxiv.org/abs/2203.15556,https://arxiv.org/abs/2302.13971,https://arxiv.org/abs/2205.01068,...,,,,https://arxiv.org/abs/2208.01448,https://huggingface.co/bigscience/bloom,https://research.facebook.com/publications/no-...,https://arxiv.org/abs/1909.08053,https://cdn.openai.com/better-language-models/...,https://cdn.openai.com/research-covers/languag...,https://proceedings.neurips.cc/paper_files/pap...
Transformer,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1
Kaplan et al. scaling laws,,0,1,1,1,0,1,0,0,1,...,,,,0,1,0,,0,0,0
Hoffmann et al. scaling laws,,1,0,0,0,1,0,1,1,0,...,,,,1,0,0,,0,0,0
Causal objective,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1


In [16]:
occurrences_df = occurrences_df[occurrences_df.index.notna()]
occurrences_df

Unnamed: 0_level_0,GPT-4,PaLM 2,Minerva (540B),PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Transformer,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1
Kaplan et al. scaling laws,,0,1,1,1,0,1,0,0,1,...,,,,0,1,0,,0,0,0
Hoffmann et al. scaling laws,,1,0,0,0,1,0,1,1,0,...,,,,1,0,0,,0,0,0
Causal objective,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1
Masked/denoising objective,,1,0,0,0,0,0,0,0,0,...,,,,1,0,1,,0,0,0
FlashAttention,,0,0,0,0,1,0,0,1,0,...,,,,0,0,0,,0,0,0
Instruction tuning,,1,0,0,0,1,0,0,1,0,...,,,,0,0,0,,0,0,0
RLHF,,0,0,0,0,1,0,0,0,0,...,,,,0,0,0,,0,0,0
PPO,,0,0,0,0,1,0,0,0,0,...,,,,0,0,0,,0,0,0
Chain-of-thought,,1,1,1,0,0,0,0,0,0,...,,,,1,0,0,,0,0,0


In [17]:
# Replace '?' values with 0
occurrences_df = occurrences_df.replace('?', 0)
occurrences_df

Unnamed: 0_level_0,GPT-4,PaLM 2,Minerva (540B),PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Transformer,,1.0,1,1,1,1,1,1,1,1,...,,,,1.0,1.0,1.0,,1,1,1
Kaplan et al. scaling laws,,0.0,1,1,1,0,1,0,0,1,...,,,,0.0,1.0,0.0,,0,0,0
Hoffmann et al. scaling laws,,1.0,0,0,0,1,0,1,1,0,...,,,,1.0,0.0,0.0,,0,0,0
Causal objective,,1.0,1,1,1,1,1,1,1,1,...,,,,1.0,1.0,1.0,,1,1,1
Masked/denoising objective,,1.0,0,0,0,0,0,0,0,0,...,,,,1.0,0.0,1.0,,0,0,0
FlashAttention,,0.0,0,0,0,1,0,0,1,0,...,,,,0.0,0.0,0.0,,0,0,0
Instruction tuning,,1.0,0,0,0,1,0,0,1,0,...,,,,0.0,0.0,0.0,,0,0,0
RLHF,,0.0,0,0,0,1,0,0,0,0,...,,,,0.0,0.0,0.0,,0,0,0
PPO,,0.0,0,0,0,1,0,0,0,0,...,,,,0.0,0.0,0.0,,0,0,0
Chain-of-thought,,1.0,1,1,0,0,0,0,0,0,...,,,,1.0,0.0,0.0,,0,0,0


In [18]:
# Filter columns
keep_systems = ['Algorithm', 'PaLM (540B)', 'Megatron-Turing NLG (530B)', 'LLaMA 2 (70B)', 'Gopher (280B)', 'Chinchilla (70B)', 'LLaMA (65B)', 'OPT-175B', 'Yuan 1.0', 'AlphaCode', 'GPT-3 (175B)']
selected_systems_occurrences_df = occurrences_df.filter(keep_systems)
selected_systems_occurrences_df

Unnamed: 0_level_0,PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,Yuan 1.0,AlphaCode,GPT-3 (175B)
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Transformer,1,1,1,1,1,1,1,1,1,1
Kaplan et al. scaling laws,1,1,0,1,0,0,1,0,0,1
Hoffmann et al. scaling laws,0,0,1,0,1,1,0,0,0,0
Causal objective,1,1,1,1,1,1,1,1,1,1
Masked/denoising objective,0,0,0,0,0,0,0,0,1,0
FlashAttention,0,0,1,0,0,1,0,0,0,0
Instruction tuning,0,0,1,0,0,1,0,0,0,0
RLHF,0,0,1,0,0,0,0,0,0,0
PPO,0,0,1,0,0,0,0,0,0,0
Chain-of-thought,1,0,0,0,0,0,0,0,0,0


In [19]:
# Make values integers
selected_systems_occurrences_df = selected_systems_occurrences_df.astype(int)

In [20]:
# Sum each row
total_occurrences_by_innovation = selected_systems_occurrences_df.sum(axis=1)
total_occurrences_by_innovation

Algorithm
Transformer                                         10
Kaplan et al. scaling laws                           5
Hoffmann et al. scaling laws                         3
Causal objective                                    10
Masked/denoising objective                           1
FlashAttention                                       2
Instruction tuning                                   2
RLHF                                                 1
PPO                                                  1
Chain-of-thought                                     1
Mixed precision training                             2
Decoder-only architecture                            7
RMSNorm                                              4
LayerNorm                                            6
Pre-normalization                                    9
Learnable position embeddings                        4
Sinusoidal position embeddings                       1
Relative position encodings                          2


In [21]:
len(origins_df[origins_df['Algorithm'] == 'GELU'])

0

In [22]:
institution_key_algorithm_occurrences = defaultdict(dict)
for innovation, occurrence_count in total_occurrences_by_innovation.items():
    if occurrence_count == 0:
        continue
    print(innovation)
    matching_origin = origins_df[origins_df['Algorithm'] == innovation]
    if len(matching_origin) == 0:
        continue
    origin_row_number = matching_origin.index[0]
    origin_affiliations = origins_df.loc[origin_row_number]['Origin affiliations']
    origin_affiliations = [affiliation.strip() for affiliation in origin_affiliations.split(';')]
    print(origin_affiliations)
    for affiliation in origin_affiliations:
        alias = institution_aliases.get(affiliation)
        print(alias)
        if alias is not None:
            institution_key_algorithm_occurrences[innovation][alias] = occurrence_count
        else:
            print(f'No alias for {affiliation}')
    print(institution_key_algorithm_occurrences)
    print()

institution_key_algorithm_occurrences

Transformer
['Google Brain', 'Google Research', 'University of Toronto']
Google
Google
None
No alias for University of Toronto
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}})

Kaplan et al. scaling laws
['Johns Hopkins University', 'OpenAI']
None
No alias for Johns Hopkins University
OpenAI
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5}})

Hoffmann et al. scaling laws
['Google DeepMind']
DeepMind
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5}, 'Hoffmann et al. scaling laws': {'DeepMind': 3}})

Causal objective
['OpenAI']
OpenAI
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5}, 'Hoffmann et al. scaling laws': {'DeepMind': 3}, 'Causal objective': {'OpenAI': 10}})

Masked/denoising objective
['Google']
Google
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5

defaultdict(dict,
            {'Transformer': {'Google': 10},
             'Kaplan et al. scaling laws': {'OpenAI': 5},
             'Hoffmann et al. scaling laws': {'DeepMind': 3},
             'Causal objective': {'OpenAI': 10},
             'Masked/denoising objective': {'Google': 1},
             'Instruction tuning': {'OpenAI': 2},
             'RLHF': {'OpenAI': 1, 'DeepMind': 1},
             'PPO': {'OpenAI': 1},
             'Chain-of-thought': {'Google': 1},
             'Mixed precision training': {'Baidu': 2, 'NVIDIA': 2},
             'Decoder-only architecture': {'Google': 7},
             'LayerNorm': {'Google': 6},
             'Pre-normalization': {'Meta': 9},
             'Learnable position embeddings': {'Meta': 4},
             'Sinusoidal position embeddings': {'Google': 1},
             'Relative position encodings': {'Google': 2},
             'Rotary position embeddings': {'Zhuiyi': 3},
             'SwiGLU activation': {'Google': 3},
             'Sparse Attent

In [305]:
# Generate the stacked bar chart
fig = go.Figure()

annotations = []
company_count_stack = defaultdict(int)

# Add bar traces
for innovation, company_counts in sorted(institution_key_algorithm_occurrences.items(), key=lambda x: min(x[1].values())):
    y_values = list(company_counts.values())
    x_values = list(company_counts.keys())
    fig.add_trace(go.Bar(name=innovation, x=x_values, y=y_values, marker=dict(color='#636EFA'))) # 09323A

    for x, y in zip(x_values, y_values):
        start_y = company_count_stack[x]
        end_y = start_y + y
        middle_y = (start_y + end_y) / 2
        company_count_stack[x] = end_y
        annotation_color = '#888888'
        annotation_font = dict(size=9, color=annotation_color)
        arrowcolor = annotation_color
        if 'Transformer' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-15, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'LayerNorm' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-72, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'Causal' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-35, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'Instruction' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-62, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'Pre-' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-15, showarrow=True, text=innovation, font=annotation_font, arrowcolor=arrowcolor))
        elif 'Hoff' in innovation:
            annotations.append(dict(x=x, y=middle_y, xanchor='left', ax=35, ay=-15, showarrow=True, text='Chinchilla scaling laws', font=annotation_font, arrowcolor=arrowcolor))


## Plot layout
fig.update_layout(
    barmode='stack',
    xaxis={'categoryorder':'total descending'},
    # title='Occurrence of innovations in the top 10 largest LMs',
    xaxis_title='Company responsible for innovation',
    yaxis_title='Occurrences in largest LMs',
    showlegend=False,
    annotations=annotations,
    autosize=False,
    width=480,
    height=250,
    # height=360,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=30, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'key_innovations_occurrence')

# Show the figure
fig.show()

In [70]:
innovation_occurrence_by_institution = defaultdict(dict)
for innovation, institution_counts in institution_key_algorithm_occurrences.items():
    for institution, count in institution_counts.items():
        innovation_occurrence_by_institution[institution][innovation] = count
innovation_occurrence_by_institution

defaultdict(dict,
            {'Google': {'Transformer': 10,
              'Masked/denoising objective': 1,
              'Chain-of-thought': 1,
              'Decoder-only architecture': 7,
              'LayerNorm': 6,
              'Sinusoidal position embeddings': 1,
              'Relative position encodings': 2,
              'SwiGLU activation': 3,
              'Multi-Query Attention': 2,
              'Grouped Query Attention': 1,
              'Dynamic batch size': 6},
             'OpenAI': {'Kaplan et al. scaling laws': 5,
              'Causal objective': 10,
              'Instruction tuning': 2,
              'RLHF': 1,
              'PPO': 1,
              'Sparse Attention': 2},
             'DeepMind': {'Hoffmann et al. scaling laws': 3, 'RLHF': 1},
             'Baidu': {'Mixed precision training': 2},
             'NVIDIA': {'Mixed precision training': 2},
             'Meta': {'Pre-normalization': 9,
              'Learnable position embeddings': 4},
             '

In [271]:
num_institutions = len(innovation_occurrence_by_institution.keys())
# Manual way to order the institutions
institution_order = ['Google', 'OpenAI', 'Meta', 'DeepMind', 'Zhuiyi', 'NVIDIA', 'Baidu']

# Create subplots with shared y-axis
fig = make_subplots(
    rows=1,
    cols=num_institutions,
    shared_yaxes=True,
    column_widths=[0.5 * len(innovation_occurrence_by_institution[institution]) for institution in institution_order],
    horizontal_spacing=0,
)

# Add traces
for i, institution in enumerate(institution_order):
    innovation_counts = innovation_occurrence_by_institution[institution]
    x = list(innovation_counts.keys())
    y = list(innovation_counts.values())
    texts = [xi if yi > 7 else None for xi, yi in zip(x, y)]
    fig.add_trace(
        go.Bar(
            x=x,
            y=y,
            name=f'{institution} ({sum(y)} total)',
            text=texts,
            textfont=dict(color='white', size=12),
            # marker_color='#636EFA',
            width=1
        ), 
        row=1,
        col=i+1
    )
    fig.update_xaxes(
        categoryorder='total descending',
        showticklabels=False,
        ticks='',
        col=i+1
    )        

## Plot layout
fig.update_layout(
    xaxis={'categoryorder':'total descending'},
    # title='Occurrence of innovations in the top 10 largest LMs',
    # xaxis_title='Company responsible for innovation',
    yaxis_title='Number of occurrences',
    legend=dict(
        orientation='h',
        y=0,
        x=0,
    ),
    autosize=False,
    width=480,
    height=250,
    # height=360,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=0, r=0, t=0, b=0),
)

save_plot(fig, result_file_location, 'key_innovations_occurrence_grouped')

fig.show()


In [27]:
institution_key_algorithm_occurrences

defaultdict(dict,
            {'Transformer': {'Google': 10},
             'Kaplan et al. scaling laws': {'OpenAI': 5},
             'Hoffmann et al. scaling laws': {'DeepMind': 3},
             'Causal objective': {'OpenAI': 10},
             'Masked/denoising objective': {'Google': 1},
             'Instruction tuning': {'OpenAI': 2},
             'RLHF': {'OpenAI': 1, 'DeepMind': 1},
             'PPO': {'OpenAI': 1},
             'Chain-of-thought': {'Google': 1},
             'Mixed precision training': {'Baidu': 2, 'NVIDIA': 2},
             'Decoder-only architecture': {'Google': 7},
             'LayerNorm': {'Google': 6},
             'Pre-normalization': {'Meta': 9},
             'Learnable position embeddings': {'Meta': 4},
             'Sinusoidal position embeddings': {'Google': 1},
             'Relative position encodings': {'Google': 2},
             'Rotary position embeddings': {'Zhuiyi': 3},
             'SwiGLU activation': {'Google': 3},
             'Sparse Attent

In [28]:
company_names = set()
for innovation, company_counts in institution_key_algorithm_occurrences.items():
    company_names.update(list(company_counts.keys()))
company_names = list(company_names)
labels = company_names
parents = [''] * len(company_names)

In [29]:
company_values = defaultdict(list)
for innovation, company_counts in institution_key_algorithm_occurrences.items():
    for company_name, count in company_counts.items():
        company_values[company_name].append(count)

In [30]:
values = [sum(company_values[company_name]) for company_name in company_names]

In [31]:
print(labels)
print(parents)
print(values)

['Meta', 'Zhuiyi', 'Google', 'DeepMind', 'OpenAI', 'Baidu', 'NVIDIA']
['', '', '', '', '', '', '']
[13, 3, 40, 4, 21, 2, 2]


In [32]:
for innovation, company_counts in institution_key_algorithm_occurrences.items():
    for company_name, count in company_counts.items():
        labels.append(innovation)
        parents.append(company_name)
        values.append(count)

In [33]:
print(labels)
print(parents)
print(values)

['Meta', 'Zhuiyi', 'Google', 'DeepMind', 'OpenAI', 'Baidu', 'NVIDIA', 'Transformer', 'Kaplan et al. scaling laws', 'Hoffmann et al. scaling laws', 'Causal objective', 'Masked/denoising objective', 'Instruction tuning', 'RLHF', 'RLHF', 'PPO', 'Chain-of-thought', 'Mixed precision training', 'Mixed precision training', 'Decoder-only architecture', 'LayerNorm', 'Pre-normalization', 'Learnable position embeddings', 'Sinusoidal position embeddings', 'Relative position encodings', 'Rotary position embeddings', 'SwiGLU activation', 'Sparse Attention', 'Multi-Query Attention', 'Grouped Query Attention', 'Dynamic batch size']
['', '', '', '', '', '', '', 'Google', 'OpenAI', 'DeepMind', 'OpenAI', 'Google', 'OpenAI', 'OpenAI', 'DeepMind', 'OpenAI', 'Google', 'Baidu', 'NVIDIA', 'Google', 'Google', 'Meta', 'Meta', 'Google', 'Google', 'Zhuiyi', 'Google', 'OpenAI', 'Google', 'Google', 'Google']
[13, 3, 40, 4, 21, 2, 2, 10, 5, 3, 10, 1, 2, 1, 1, 1, 1, 2, 2, 7, 6, 9, 4, 1, 2, 3, 3, 2, 2, 1, 6]


In [34]:
fig = go.Figure(go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    textinfo="label+value",
))

## Plot layout
fig.update_layout(
    width=480,
    height=360,
    font=dict(size=12),
    # uniformtext=dict(minsize=6, mode='hide'),
    margin=dict(l=20, r=20, t=30, b=20),
)

fig.show()

In [35]:
import plotly.graph_objects as go

# Sample Data
labels = ["Google", "OpenAI", "Meta", 
          "Algorithm A", "Algorithm B", "Algorithm C", 
          "Algorithm D", "Algorithm E",
          "Algorithm F"]

parents = ["", "", "", 
           "Google", "Google", "Google", 
           "OpenAI", "OpenAI",
           "Meta"]

values = [40, 20, 20,   # Company total values
          10, 10, 20,   # Google's algorithm values
          10, 10,       # OpenAI's algorithm values
          20]           # Meta's algorithm value

fig = go.Figure(go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    marker_colors=["blue", "purple", "green"] + ["lightblue"]*3 + ["lightpurple"]*2 + ["lightgreen"],
    textinfo="label+value"
))

## Plot layout
fig.update_layout(
    width=480,
    # height=250,
    height=360,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=30, b=20),
)

fig.show()


# Count occurrence of innovations by citations

Create ID => affiliations mapping

In [36]:
origin_affiliation = {}
for i, row in origins_df.iterrows():
    title = row['Origin title']
    search_title = title.replace(',', '')
    affiliations = row['Origin affiliations']
    affiliations = [affiliation.strip() for affiliation in affiliations.split(';')]
    print(title)

    search_results = Works().search_filter(title=search_title).get()
    if len(search_results) > 0:
        for result in search_results:
            if result['title'].lower() == title.lower():
                print("Matched title: ", result['title'])
                origin_affiliation[result['id']] = affiliations

        # top_result = search_results[0]
        # print("Top result: ", top_result['title'])
        # # Check relevance of result is high enough
        # if len(search_results) > 1:
        #     print("Relevance score: ", top_result['relevance_score'])
        #     # Factor out the number of citations - we want to match on the title
        #     match_score = top_result['relevance_score'] / top_result['cited_by_count']**0.5
        #     print("Match score: ", match_score)
        #     if match_score >= 50:
        #         origin_affiliation[top_result['id']] = affiliations
        # else:
        #     origin_affiliation[top_result['id']] = affiliations
    print()

Attention Is All You Need
Matched title:  Attention is All you Need
Matched title:  Attention Is All You Need
Matched title:  Attention Is All You Need

Scaling Laws for Neural Language Models
Matched title:  Scaling Laws for Neural Language Models

Training Compute-Optimal Large Language Models
Matched title:  Training Compute-Optimal Large Language Models

Improving Language Understanding by Generative Pre-Training

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Matched title:  BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding

Neural Machine Translation by Jointly Learning to Align and Translate
Matched title:  Neural Machine Translation by Jointly Learning to Align and Translate
Matched title:  Neural Machine Translation by Jointly Learning to Align and Translate
Matched title:  Neural Machine Translation by Jointly Learning to Align and Translate

FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awar

In [37]:
origin_affiliation

{'https://openalex.org/W2963403868': ['Google Brain',
  'Google Research',
  'University of Toronto'],
 'https://openalex.org/W2626778328': ['Google Brain',
  'Google Research',
  'University of Toronto'],
 'https://openalex.org/W4385245566': ['Google Brain',
  'Google Research',
  'University of Toronto'],
 'https://openalex.org/W3001279689': ['Johns Hopkins University', 'OpenAI'],
 'https://openalex.org/W4225591000': ['Google DeepMind'],
 'https://openalex.org/W2896457183': ['Google'],
 'https://openalex.org/W2964308564': ['Jacobs University Bremen, Germany',
  'University of Montreal'],
 'https://openalex.org/W2133564696': ['Jacobs University Bremen, Germany',
  'University of Montreal'],
 'https://openalex.org/W4297734170': ['Jacobs University Bremen, Germany',
  'University of Montreal'],
 'https://openalex.org/W4226278401': ['OpenAI'],
 'https://openalex.org/W2964263543': ['OpenAI', 'DeepMind'],
 'https://openalex.org/W2626804490': ['OpenAI', 'DeepMind'],
 'https://openalex.org/W

Now get a list of notable ML system papers

In [38]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
sheet_id = '1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet='
notable_df = pd.read_csv(data_url + 'NOTABLE%20ML%20SYSTEMS')

In [39]:
notable_df.head()

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Confidence,Abstract,Last Modified
0,Gen-2,Text-to-Video,Video generation,Runway,Industry,Gen-2 authors,2023-12-31,,https://research.runwayml.com/gen2,0.0,...,,,,,,,,Unverified,,2023-08-15 20:44:53
1,Falcon 180B,Language,Language modelling,Technology Innovation Institute,Government,,2023-09-06,Falcon LLM - Falcon 180B,https://falconllm.tii.ae/falcon-180b.html,0.0,...,,,,,,,,Likely,Falcon 180B is a super-powerful language model...,2023-09-06 21:40:15
2,Swift,Robotics,Helicopter driving,Intel Labs,Industry - Academia Collaboration (Industry le...,"Elia Kaufmann, Leonard Bauersfeld, Antonio Loq...",2023-08-30,Champion-level drone racing using deep reinfor...,https://www.nature.com/articles/s41586-023-064...,1.0,...,"50 minutes (training details, page 8)",NVIDIA GeForce RTX 3090,Reinforcement learning,,,,Industry,Likely,First-person view (FPV) drone racing is a tele...,2023-09-06 15:39:52
3,Jais,Language,Language modelling,"Cerebras Systems,Mohamed bin Zayed University ...",Industry - Academia Collaboration (Industry le...,"Neha Sengupta, Sunil Kumar Sahu, Bokang Jia, S...",2023-08-29,Jais and Jais-chat: Arabic-Centric Foundation ...,https://inceptioniai.org/jais/docs/Technicalpa...,0.0,...,2023 June 25 to July 18 = 25 days = 600 hours,,,,,,Industry,Confident,"We introduce Jais and Jais-chat, new state-of-...",2023-09-21 03:56:35
4,Llama 2,Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Louis Martin, Kevin Stone, Peter...",2023-07-18,Llama 2: Open Foundation and Fine-Tuned Chat M...,https://ai.meta.com/research/publications/llam...,55.0,...,Model was trained from January 2023 to July 20...,NVIDIA A100 SXM4 80 GB,Supervised,1620000.0,A100 cost in 2023: $1.10/hour\nTraining time: ...,,Industry,Confident,"In this work, we develop and release Llama 2, ...",2023-10-11 20:29:22


In [40]:
# Ensure date column is in datetime format
notable_df['Publication date'] = pd.to_datetime(notable_df['Publication date'])

In [41]:
# Filter by Language domain since 2021
notable_language_df = notable_df[notable_df['Domain'] == 'Language']
notable_recent_language_df = notable_language_df[notable_df['Publication date'] >= pd.to_datetime('2021-01-01')]
notable_recent_language_df


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Confidence,Abstract,Last Modified
1,Falcon 180B,Language,Language modelling,Technology Innovation Institute,Government,,2023-09-06,Falcon LLM - Falcon 180B,https://falconllm.tii.ae/falcon-180b.html,0.0,...,,,,,,,,Likely,Falcon 180B is a super-powerful language model...,2023-09-06 21:40:15
3,Jais,Language,Language modelling,"Cerebras Systems,Mohamed bin Zayed University ...",Industry - Academia Collaboration (Industry le...,"Neha Sengupta, Sunil Kumar Sahu, Bokang Jia, S...",2023-08-29,Jais and Jais-chat: Arabic-Centric Foundation ...,https://inceptioniai.org/jais/docs/Technicalpa...,0.0,...,2023 June 25 to July 18 = 25 days = 600 hours,,,,,,Industry,Confident,"We introduce Jais and Jais-chat, new state-of-...",2023-09-21 03:56:35
4,Llama 2,Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Louis Martin, Kevin Stone, Peter...",2023-07-18,Llama 2: Open Foundation and Fine-Tuned Chat M...,https://ai.meta.com/research/publications/llam...,55.0,...,Model was trained from January 2023 to July 20...,NVIDIA A100 SXM4 80 GB,Supervised,1620000.0,A100 cost in 2023: $1.10/hour\nTraining time: ...,,Industry,Confident,"In this work, we develop and release Llama 2, ...",2023-10-11 20:29:22
5,Claude 2,Language,Language modelling,Anthropic,Industry,,2023-07-11,,https://www.anthropic.com/index/claude-2,0.0,...,,,,,,,,Speculative,,2023-10-23 14:32:28
6,InternLM,Language,Language modelling,"Shanghai AI Lab,SenseTime",Academia,,2023-07-06,,https://internlm.org/,0.0,...,Training performance for the open-source Inter...,NVIDIA A100 SXM4 80 GB,,,,,,Speculative,Pre-training a bilingual 100B Foundation model...,2023-09-19 16:39:25
8,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,0.0,...,,,,,,,,Unverified,,2023-09-19 16:10:40
9,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,0.0,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-09-19 16:10:42
12,PaLM 2,Language,Language modelling,Google,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,111.0,...,,,,,PaLM 2 was trained on TPU v4 according to the ...,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-09-19 16:10:59
13,Vicuna-13B,Language,,Large Model Systems Organization,Academia,,2023-03-30,Vicuna: An Open-Source Chatbot Impressing GPT-...,https://lmsys.org/blog/2023-03-30-vicuna/,0.0,...,,,,259.0,"$300 in 2020, adjusted for inflation using BLS...",,Academia,Speculative,,2023-09-20 21:22:15
14,Falcon-40B,Language,Language modelling,Technology Innovation Institute,Government,,2023-03-15,Abu Dhabi-based Technology Innovation Institut...,https://www.tii.ae/news/abu-dhabi-based-techno...,0.0,...,"""Falcon-40B was trained on AWS SageMaker, on 3...",NVIDIA A100,,,,,Academia,Confident,,2023-09-20 21:22:05


In [42]:
for i, row in notable_recent_language_df.iterrows():
    print(row['System'], row['Link'])

Falcon 180B https://falconllm.tii.ae/falcon-180b.html
Jais https://inceptioniai.org/jais/docs/Technicalpaper.pdf
Llama 2 https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/
Claude 2 https://www.anthropic.com/index/claude-2
InternLM https://internlm.org/
ERNIE 3.5 http://research.baidu.com/Blog/index-view?id=185
Inflection-1 https://inflection.ai/assets/Inflection-1.pdf
PaLM 2 https://ai.google/static/documents/palm2techreport.pdf
Vicuna-13B https://lmsys.org/blog/2023-03-30-vicuna/
Falcon-40B https://www.tii.ae/news/abu-dhabi-based-technology-innovation-institute-introduces-falcon-llm-foundational-large
Claude https://www.anthropic.com/index/introducing-claude
ALM 1.0 https://github.com/FlagAI-Open/FlagAI/blob/master/examples/ALM/README.md
GPT-3.5 (text-davinci-003) https://platform.openai.com/docs/models/gpt-3-5
Galactica https://galactica.org/static/paper.pdf
BLOOM https://huggingface.co/bigscience/bloom
GLM-130B https://keg.cs.tsinghua.edu.c

In [43]:
notable_recent_language_df = notable_recent_language_df.dropna(subset=['Reference'])

In [44]:
notable_recent_language_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Confidence,Abstract,Last Modified
1,Falcon 180B,Language,Language modelling,Technology Innovation Institute,Government,,2023-09-06,Falcon LLM - Falcon 180B,https://falconllm.tii.ae/falcon-180b.html,0.0,...,,,,,,,,Likely,Falcon 180B is a super-powerful language model...,2023-09-06 21:40:15
3,Jais,Language,Language modelling,"Cerebras Systems,Mohamed bin Zayed University ...",Industry - Academia Collaboration (Industry le...,"Neha Sengupta, Sunil Kumar Sahu, Bokang Jia, S...",2023-08-29,Jais and Jais-chat: Arabic-Centric Foundation ...,https://inceptioniai.org/jais/docs/Technicalpa...,0.0,...,2023 June 25 to July 18 = 25 days = 600 hours,,,,,,Industry,Confident,"We introduce Jais and Jais-chat, new state-of-...",2023-09-21 03:56:35
4,Llama 2,Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Louis Martin, Kevin Stone, Peter...",2023-07-18,Llama 2: Open Foundation and Fine-Tuned Chat M...,https://ai.meta.com/research/publications/llam...,55.0,...,Model was trained from January 2023 to July 20...,NVIDIA A100 SXM4 80 GB,Supervised,1620000.0,A100 cost in 2023: $1.10/hour\nTraining time: ...,,Industry,Confident,"In this work, we develop and release Llama 2, ...",2023-10-11 20:29:22
8,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,0.0,...,,,,,,,,Unverified,,2023-09-19 16:10:40
9,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,0.0,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-09-19 16:10:42
12,PaLM 2,Language,Language modelling,Google,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,111.0,...,,,,,PaLM 2 was trained on TPU v4 according to the ...,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-09-19 16:10:59
13,Vicuna-13B,Language,,Large Model Systems Organization,Academia,,2023-03-30,Vicuna: An Open-Source Chatbot Impressing GPT-...,https://lmsys.org/blog/2023-03-30-vicuna/,0.0,...,,,,259.0,"$300 in 2020, adjusted for inflation using BLS...",,Academia,Speculative,,2023-09-20 21:22:15
14,Falcon-40B,Language,Language modelling,Technology Innovation Institute,Government,,2023-03-15,Abu Dhabi-based Technology Innovation Institut...,https://www.tii.ae/news/abu-dhabi-based-techno...,0.0,...,"""Falcon-40B was trained on AWS SageMaker, on 3...",NVIDIA A100,,,,,Academia,Confident,,2023-09-20 21:22:05
16,Claude,Language,Language modelling,Anthropic,Industry,,2023-03-14,Introducing Claude,https://www.anthropic.com/index/introducing-cl...,0.0,...,,,Reinforcement learning,,,,,Unverified,Claude is a next-generation AI assistant based...,2023-09-20 21:22:57
19,ALM 1.0,Language,Language modelling,BAAI,Academia,,2022-11-28,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,0.0,...,,,,,,,,Speculative,,2023-09-19 16:11:29


In [45]:
notable_recent_language_works = []
for i, row in notable_recent_language_df.iterrows():
    title = row['Reference']
    print('Reference:', title)
    # Remove commas to avoid issues with query parsing
    search_title = title.replace(',', '')
    search_results = Works().search_filter(title=search_title).get()
    if len(search_results) > 0:
        for result in search_results:
            if result['title'].lower() == title.lower():
                print("Matched title: ", result['title'])
                notable_recent_language_works.append(result)

        # top_result = search_results[0]
        # print("Top result: ", top_result['title'])
        # # Check relevance of result is high enough
        # if len(search_results) > 1 and top_result.get('relevance_score') is not None:
        #     print("Relevance score: ", top_result['relevance_score'])
        #     # Factor out the number of citations - we want to match on the title
        #     match_score = top_result['relevance_score'] / top_result['cited_by_count']**0.5
        #     print("Match score: ", match_score)
        #     if match_score >= 50:
        #         notable_recent_language_works.append(top_result)
        # else:
        #     notable_recent_language_works.append(top_result)
    print()

Reference: Falcon LLM - Falcon 180B

Reference: Jais and Jais-chat: Arabic-Centric Foundation and Instruction-Tuned Open Generative Large Language Models

Reference: Llama 2: Open Foundation and Fine-Tuned Chat Models
Matched title:  Llama 2: Open Foundation and Fine-Tuned Chat Models

Reference: Introducing ERNIE 3.5: Baidu’s Knowledge-Enhanced Foundation Model Takes a Giant Leap Forward

Reference: Inflection-1 technical memo

Reference: PaLM 2 Technical Report
Matched title:  PaLM 2 Technical Report

Reference: Vicuna: An Open-Source Chatbot Impressing GPT-4 with 90%* ChatGPT Quality

Reference: Abu Dhabi-based Technology Innovation Institute Introduces Falcon LLM: Foundational Large Language Model (LLM) outperforms GPT-3 with 40 Billion Parameters

Reference: Introducing Claude

Reference: ALM 1.0

Reference: Galactica: A Large Language Model for Science
Matched title:  Galactica: A Large Language Model for Science

Reference: BigScience Large Open-science Open-access Multilingual 

QueryError:  Insights from Training Gopher is not a valid parameter. Valid parameters are: cursor, filter, format, group_by, group-by, group_bys, group-bys, mailto, page, per_page, per-page, q, sample, seed, search, select, sort.

In [None]:
print(len(notable_recent_language_works), 'works')
for work in notable_recent_language_works:
    print(work['display_name'])

27 works
Llama 2: Open Foundation and Fine-Tuned Chat Models
PaLM 2 Technical Report
No Language Left Behind: Scaling Human-Centered Machine Translation
Solving Quantitative Reasoning Problems with Language Models
Efficient Language Modeling with Sparse all-MLP
PaLM: Scaling Language Modeling with Pathways
Training Compute-Optimal Large Language Models
LaMDA: Language Models for Dialog Applications
GPT-NeoX-20B: An Open-Source Autoregressive Language Model
GPT-NeoX-20B: An Open-Source Autoregressive Language Model
Competition-Level Code Generation with AlphaCode
Training language models to follow instructions with human feedback
Few-shot Learning with Multilingual Language Models
PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation
Finetuned Language Models Are Zero-Shot Learners
Finetuned Language Models Are Zero-Shot Learners
Larger-Scale Transformers for Multilingual Masked Language Modeling
Larger-Scale Transformers for Multilingual Masked Language Modeling
Large

In [None]:
algorithm_occurences = defaultdict(list)

- For each work in `notable_works`
  - Fetch the list of references
  - For each reference
    - If its ID is in `algorithm_origin_works`
    - Append the ID of the work to the occurrence dict

In [None]:
for work in notable_recent_language_works:
    print(work['display_name'])
    # Fetch the list of references
    references = work['referenced_works']
    if len(references) == 0:
        print('No references found')
        continue
    for reference_work_id in references:
        if reference_work_id in origin_affiliation.keys():
            algorithm_occurences[reference_work_id].append(work['id'])

Llama 2: Open Foundation and Fine-Tuned Chat Models
No references found
PaLM 2 Technical Report
No references found
No Language Left Behind: Scaling Human-Centered Machine Translation
No references found
Solving Quantitative Reasoning Problems with Language Models
No references found
Efficient Language Modeling with Sparse all-MLP
No references found
PaLM: Scaling Language Modeling with Pathways
No references found
Training Compute-Optimal Large Language Models
No references found
LaMDA: Language Models for Dialog Applications
No references found
GPT-NeoX-20B: An Open-Source Autoregressive Language Model
No references found
GPT-NeoX-20B: An Open-Source Autoregressive Language Model
No references found
Competition-Level Code Generation with AlphaCode
No references found
Training language models to follow instructions with human feedback
No references found
Few-shot Learning with Multilingual Language Models
No references found
PLATO-XL: Exploring the Large-scale Pre-training of Dialogue

In [None]:
algorithm_occurences

defaultdict(list,
            {'https://openalex.org/W2896457183': ['https://openalex.org/W3200128700'],
             'https://openalex.org/W2964121744': ['https://openalex.org/W3200128700',
              'https://openalex.org/W3155584966'],
             'https://openalex.org/W3001279689': ['https://openalex.org/W3200128700',
              'https://openalex.org/W3177813494',
              'https://openalex.org/W3023786569',
              'https://openalex.org/W3155584966'],
             'https://openalex.org/W2963403868': ['https://openalex.org/W3185293939',
              'https://openalex.org/W3159134453',
              'https://openalex.org/W3023786569',
              'https://openalex.org/W3155584966'],
             'https://openalex.org/W1522301498': ['https://openalex.org/W3169320628',
              'https://openalex.org/W3023786569'],
             'https://openalex.org/W2626778328': ['https://openalex.org/W3177813494',
              'https://openalex.org/W3153553004'],
          

In [None]:
for algorithm_id, occurrence_ids in algorithm_occurences.items():
    print(Works()[algorithm_id]['display_name'])
    print([Works()[occurrence_id]['display_name'] for occurrence_id in occurrence_ids])
    print()

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
['PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation']

Adam: A Method for Stochastic Optimization
['PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation', 'Recipes for Building an Open-Domain Chatbot']

Scaling Laws for Neural Language Models
['PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation', 'Evaluating Large Language Models Trained on Code', 'Recipes for building an open-domain chatbot', 'Recipes for Building an Open-Domain Chatbot']

Attention is All you Need
['Larger-Scale Transformers for Multilingual Masked Language Modeling', 'Larger-Scale Transformers for Multilingual Masked Language Modeling', 'Recipes for building an open-domain chatbot', 'Recipes for Building an Open-Domain Chatbot']

Adam: A Method for Stochastic Optimization
['HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units', 'Recipes 

In [None]:
w = Works().search_filter(title="Attention is all you need")
tmp = w.get()
tmp

[{'id': 'https://openalex.org/W2963403868',
  'doi': None,
  'title': 'Attention is All you Need',
  'display_name': 'Attention is All you Need',
  'relevance_score': 13173.425,
  'publication_year': 2017,
  'publication_date': '2017-06-12',
  'ids': {'openalex': 'https://openalex.org/W2963403868', 'mag': '2963403868'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'https://arxiv.org/pdf/1706.03762v5',
   'pdf_url': None,
   'source': {'id': 'https://openalex.org/S4306400194',
    'display_name': 'arXiv (Cornell University)',
    'issn_l': None,
    'issn': None,
    'is_oa': True,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/I205783295',
    'host_organization_name': 'Cornell University',
    'host_organization_lineage': ['https://openalex.org/I205783295'],
    'host_organization_lineage_names': ['Cornell University'],
    'type': 'repository'},
   'license': None,
   'version': None,
   'is_accepted': False,
   'is_publis