In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
from datetime import datetime
import os
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import pyalex
from pyalex import Works

from researcher_impact.plotting import save_plot
from researcher_impact.utils import dict_to_dataarray

In [3]:
# The polite pool has much faster and more consistent response times. To get into the polite pool, you set your email:
pyalex.config.email = "ben@epochai.org"

In [4]:
pio.templates.default = "plotly_white"

In [5]:
result_file_location = 'results/'
os.makedirs(result_file_location, exist_ok=True)

In [111]:
# Download dataset
sheet_id = '1L_j7OaX19HXWWIx_apKvWo2OteY1XOB7FamaLEd_p0s'
tab_id = '578731623'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={tab_id}&format=csv'
origins_df = pd.read_csv(data_url)

In [112]:
origins_df.head()

Unnamed: 0,Algorithm,Origin title,Origin link,Origin publication date,Origin affiliations,Origin authors
0,Transformer,Attention Is All You Need,https://arxiv.org/abs/1706.03762,2017-Jun-12,Google Brain; Google Research; University of T...,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak..."
1,Kaplan et al. scaling laws,Scaling Laws for Neural Language Models,https://arxiv.org/abs/2001.08361,2020-Jan-23,Johns Hopkins University; OpenAI,"Jared Kaplan, Sam McCandlish, Tom Henighan, To..."
2,Hoffmann et al. scaling laws,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,2022-Mar-29,Google DeepMind,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me..."
3,Causal objective,Improving Language Understanding by Generative...,https://openai.com/research/language-unsupervised,2018-Jun-11,OpenAI,"Alec Radford, Karthik Narasimhan, Tim Salimans..."
4,Masked/denoising objective,BERT: Pre-training of Deep Bidirectional Trans...,https://arxiv.org/abs/1810.04805,2018-Oct-11,Google,"Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kris..."


In [113]:
origins_df.dropna(subset=['Origin affiliations'], inplace=True)
origins_df.head()

Unnamed: 0,Algorithm,Origin title,Origin link,Origin publication date,Origin affiliations,Origin authors
0,Transformer,Attention Is All You Need,https://arxiv.org/abs/1706.03762,2017-Jun-12,Google Brain; Google Research; University of T...,"Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak..."
1,Kaplan et al. scaling laws,Scaling Laws for Neural Language Models,https://arxiv.org/abs/2001.08361,2020-Jan-23,Johns Hopkins University; OpenAI,"Jared Kaplan, Sam McCandlish, Tom Henighan, To..."
2,Hoffmann et al. scaling laws,Training Compute-Optimal Large Language Models,https://arxiv.org/abs/2203.15556,2022-Mar-29,Google DeepMind,"Jordan Hoffmann, Sebastian Borgeaud, Arthur Me..."
3,Causal objective,Improving Language Understanding by Generative...,https://openai.com/research/language-unsupervised,2018-Jun-11,OpenAI,"Alec Radford, Karthik Narasimhan, Tim Salimans..."
4,Masked/denoising objective,BERT: Pre-training of Deep Bidirectional Trans...,https://arxiv.org/abs/1810.04805,2018-Oct-11,Google,"Jacob Devlin, Ming-Wei Chang, Kenton Lee, Kris..."


Create institution => origins mapping

In [158]:
institution_aliases = {
    'Google': 'Google',
    'Google Brain': 'Google',
    'Google Research': 'Google',
    'DeepMind': 'DeepMind',
    'Google DeepMind': 'DeepMind',
    'OpenAI': 'OpenAI',
    'Baidu Research': 'Baidu',
    'NVIDIA': 'NVIDIA',
    'Facebook AI Research': 'Meta',
    'Zhuiyi Technology Co., Ltd.': 'Zhuiyi',
}

In [159]:
institution_key_algorithms = defaultdict(list)
for i, row in origins_df.iterrows():
    algorithm_name = row['Algorithm']
    affiliations = row['Origin affiliations']
    affiliations = [affiliation.strip() for affiliation in affiliations.split(';')]
    for affiliation in affiliations:
        if institution_aliases.get(affiliation) is not None:
            alias = institution_aliases[affiliation]
            institution_key_algorithms[alias].append(algorithm_name)
institution_key_algorithms

defaultdict(list,
            {'Google': ['Transformer',
              'Transformer',
              'Masked/denoising objective',
              'Chain-of-thought',
              'Decoder-only architecture',
              'LayerNorm',
              'Sinusoidal position embeddings',
              'Sinusoidal position embeddings',
              'Relative position encodings',
              'SwiGLU activation',
              'Sparsely-Gated Mixture-of-Experts layer (MoE)',
              'Multi-Query Attention',
              'Grouped Query Attention',
              'Dynamic batch size'],
             'OpenAI': ['Kaplan et al. scaling laws',
              'Causal objective',
              'Instruction tuning',
              'RLHF',
              'PPO',
              'Sparse Attention'],
             'DeepMind': ['Hoffmann et al. scaling laws', 'RLHF'],
             'Baidu': ['Mixed precision training'],
             'NVIDIA': ['Mixed precision training'],
             'Meta': ['Pre-normaliza

In [160]:
institution_key_algorithms_count = dict_to_dataarray(institution_key_algorithms, dim='institution', val_fn=len)
institution_key_algorithms_count

In [161]:
fig = go.Figure(data=[
    go.Bar(
        name='Key innovations',
        x=institution_key_algorithms_count.institution,
        y=institution_key_algorithms_count
    ),
])

## Plot layout
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.update_layout(
    # title='Initial ranking of companies leading in AI research',
    # xaxis_title='Company',
    yaxis_title='Number of innovations for LLMs',
)
fig.update_layout(
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="right",
        x=0.99,
    ),
)
fig.update_layout(
    autosize=False,
    width=400,
    height=300,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=20, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'num_key_innovations')

## Show plot
fig.show()

# Count occurrence of innovations directly

In [118]:
# Download dataset
sheet_id = '1L_j7OaX19HXWWIx_apKvWo2OteY1XOB7FamaLEd_p0s'
tab_id = '1765093800'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/export?gid={tab_id}&format=csv'
occurrences_df = pd.read_csv(data_url, index_col='Algorithm')

In [119]:
occurrences_df.head()

Unnamed: 0_level_0,GPT-4,PaLM 2,Minerva (540B),PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,,https://ai.google/static/documents/palm2techre...,https://arxiv.org/abs/2206.14858,https://arxiv.org/abs/2204.02311,https://arxiv.org/abs/2201.11990,https://ai.meta.com/research/publications/llam...,https://arxiv.org/abs/2112.11446,https://arxiv.org/abs/2203.15556,https://arxiv.org/abs/2302.13971,https://arxiv.org/abs/2205.01068,...,,,,https://arxiv.org/abs/2208.01448,https://huggingface.co/bigscience/bloom,https://research.facebook.com/publications/no-...,https://arxiv.org/abs/1909.08053,https://cdn.openai.com/better-language-models/...,https://cdn.openai.com/research-covers/languag...,https://proceedings.neurips.cc/paper_files/pap...
Transformer,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1
Kaplan et al. scaling laws,,0,1,1,1,0,1,0,0,1,...,,,,0,1,0,,0,0,0
Hoffmann et al. scaling laws,,1,0,0,0,1,0,1,1,0,...,,,,1,0,0,,0,0,0
Causal objective,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1


In [120]:
occurrences_df = occurrences_df[occurrences_df.index.notna()]
occurrences_df

Unnamed: 0_level_0,GPT-4,PaLM 2,Minerva (540B),PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Transformer,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1
Kaplan et al. scaling laws,,0,1,1,1,0,1,0,0,1,...,,,,0,1,0,,0,0,0
Hoffmann et al. scaling laws,,1,0,0,0,1,0,1,1,0,...,,,,1,0,0,,0,0,0
Causal objective,,1,1,1,1,1,1,1,1,1,...,,,,1,1,1,,1,1,1
Masked/denoising objective,,1,0,0,0,0,0,0,0,0,...,,,,1,0,1,,0,0,0
FlashAttention,,0,0,0,0,1,0,0,1,0,...,,,,0,0,0,,0,0,0
Instruction tuning,,1,0,0,0,1,0,0,1,0,...,,,,0,0,0,,0,0,0
RLHF,,0,0,0,0,1,0,0,0,0,...,,,,0,0,0,,0,0,0
PPO,,0,0,0,0,1,0,0,0,0,...,,,,0,0,0,,0,0,0
Chain-of-thought,,1,1,1,0,0,0,0,0,0,...,,,,1,0,0,,0,0,0


In [121]:
# Replace '?' values with 0
occurrences_df = occurrences_df.replace('?', 0)
occurrences_df

Unnamed: 0_level_0,GPT-4,PaLM 2,Minerva (540B),PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,...,Falcon-40B,YaLM,ALIGN,AlexaTM 20B,BLOOM (176B),NLLB,Megatron-LM,GPT-2,GPT,Transformer
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Transformer,,1.0,1,1,1,1,1,1,1,1,...,,,,1.0,1.0,1.0,,1,1,1
Kaplan et al. scaling laws,,0.0,1,1,1,0,1,0,0,1,...,,,,0.0,1.0,0.0,,0,0,0
Hoffmann et al. scaling laws,,1.0,0,0,0,1,0,1,1,0,...,,,,1.0,0.0,0.0,,0,0,0
Causal objective,,1.0,1,1,1,1,1,1,1,1,...,,,,1.0,1.0,1.0,,1,1,1
Masked/denoising objective,,1.0,0,0,0,0,0,0,0,0,...,,,,1.0,0.0,1.0,,0,0,0
FlashAttention,,0.0,0,0,0,1,0,0,1,0,...,,,,0.0,0.0,0.0,,0,0,0
Instruction tuning,,1.0,0,0,0,1,0,0,1,0,...,,,,0.0,0.0,0.0,,0,0,0
RLHF,,0.0,0,0,0,1,0,0,0,0,...,,,,0.0,0.0,0.0,,0,0,0
PPO,,0.0,0,0,0,1,0,0,0,0,...,,,,0.0,0.0,0.0,,0,0,0
Chain-of-thought,,1.0,1,1,0,0,0,0,0,0,...,,,,1.0,0.0,0.0,,0,0,0


In [122]:
# Filter columns
keep_systems = ['Algorithm', 'PaLM (540B)', 'Megatron-Turing NLG (530B)', 'LLaMA 2 (70B)', 'Gopher (280B)', 'Chinchilla (70B)', 'LLaMA (65B)', 'OPT-175B', 'Yuan 1.0', 'AlphaCode', 'GPT-3 (175B)']
selected_systems_occurrences_df = occurrences_df.filter(keep_systems)
selected_systems_occurrences_df

Unnamed: 0_level_0,PaLM (540B),Megatron-Turing NLG (530B),LLaMA 2 (70B),Gopher (280B),Chinchilla (70B),LLaMA (65B),OPT-175B,Yuan 1.0,AlphaCode,GPT-3 (175B)
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Transformer,1,1,1,1,1,1,1,1,1,1
Kaplan et al. scaling laws,1,1,0,1,0,0,1,0,0,1
Hoffmann et al. scaling laws,0,0,1,0,1,1,0,0,0,0
Causal objective,1,1,1,1,1,1,1,1,1,1
Masked/denoising objective,0,0,0,0,0,0,0,0,1,0
FlashAttention,0,0,1,0,0,1,0,0,0,0
Instruction tuning,0,0,1,0,0,1,0,0,0,0
RLHF,0,0,1,0,0,0,0,0,0,0
PPO,0,0,1,0,0,0,0,0,0,0
Chain-of-thought,1,0,0,0,0,0,0,0,0,0


In [123]:
# Make values integers
selected_systems_occurrences_df = selected_systems_occurrences_df.astype(int)

In [124]:
# Sum each row
total_occurrences_by_innovation = selected_systems_occurrences_df.sum(axis=1)
total_occurrences_by_innovation

Algorithm
Transformer                                         10
Kaplan et al. scaling laws                           5
Hoffmann et al. scaling laws                         3
Causal objective                                    10
Masked/denoising objective                           1
FlashAttention                                       2
Instruction tuning                                   2
RLHF                                                 1
PPO                                                  1
Chain-of-thought                                     1
Mixed precision training                             2
Decoder-only architecture                            7
RMSNorm                                              4
LayerNorm                                            6
Pre-normalization                                    9
Learnable position embeddings                        4
Sinusoidal position embeddings                       1
Relative position encodings                          2


In [125]:
len(origins_df[origins_df['Algorithm'] == 'GELU'])

0

In [142]:
institution_key_algorithm_occurrences = defaultdict(dict)
for innovation, occurrence_count in total_occurrences_by_innovation.items():
    if occurrence_count == 0:
        continue
    print(innovation)
    matching_origin = origins_df[origins_df['Algorithm'] == innovation]
    if len(matching_origin) == 0:
        continue
    origin_row_number = matching_origin.index[0]
    origin_affiliations = origins_df.loc[origin_row_number]['Origin affiliations']
    origin_affiliations = [affiliation.strip() for affiliation in origin_affiliations.split(';')]
    print(origin_affiliations)
    for affiliation in origin_affiliations:
        alias = institution_aliases.get(affiliation)
        print(alias)
        if alias is not None:
            institution_key_algorithm_occurrences[innovation][alias] = occurrence_count
        else:
            print(f'No alias for {affiliation}')
    print(institution_key_algorithm_occurrences)
    print()

institution_key_algorithm_occurrences

Transformer
['Google Brain', 'Google Research', 'University of Toronto']
Google
Google
None
No alias for University of Toronto
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}})

Kaplan et al. scaling laws
['Johns Hopkins University', 'OpenAI']
None
No alias for Johns Hopkins University
OpenAI
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5}})

Hoffmann et al. scaling laws
['Google DeepMind']
DeepMind
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5}, 'Hoffmann et al. scaling laws': {'DeepMind': 3}})

Causal objective
['OpenAI']
OpenAI
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5}, 'Hoffmann et al. scaling laws': {'DeepMind': 3}, 'Causal objective': {'OpenAI': 10}})

Masked/denoising objective
['Google']
Google
defaultdict(<class 'dict'>, {'Transformer': {'Google': 10}, 'Kaplan et al. scaling laws': {'OpenAI': 5

defaultdict(dict,
            {'Transformer': {'Google': 10},
             'Kaplan et al. scaling laws': {'OpenAI': 5},
             'Hoffmann et al. scaling laws': {'DeepMind': 3},
             'Causal objective': {'OpenAI': 10},
             'Masked/denoising objective': {'Google': 1},
             'Instruction tuning': {'OpenAI': 2},
             'RLHF': {'OpenAI': 1, 'DeepMind': 1},
             'PPO': {'OpenAI': 1},
             'Chain-of-thought': {'Google': 1},
             'Mixed precision training': {'Baidu': 2, 'NVIDIA': 2},
             'Decoder-only architecture': {'Google': 7},
             'LayerNorm': {'Google': 6},
             'Pre-normalization': {'Meta': 9},
             'Learnable position embeddings': {'Meta': 4},
             'Sinusoidal position embeddings': {'Google': 1},
             'Relative position encodings': {'Google': 2},
             'Rotary position embeddings': {'Zhuiyi': 3},
             'SwiGLU activation': {'Google': 3},
             'Sparse Attent

In [168]:
# Generate the stacked bar chart
fig = go.Figure()

annotations = []
company_count_stack = defaultdict(int)

# Add bar traces
for innovation, company_counts in sorted(institution_key_algorithm_occurrences.items(), key=lambda x: min(x[1].values()), reverse=True):
    y_values = list(company_counts.values())
    x_values = list(company_counts.keys())
    fig.add_trace(go.Bar(name=innovation, x=x_values, y=y_values, marker=dict(color='blue', opacity=0.6)))

    for x, y in zip(x_values, y_values):
        # if innovation in ['Transformer']:
        start_y = company_count_stack[x]
        end_y = start_y + y
        middle_y = (start_y + end_y) / 2
        if y > 5:  # Choose a threshold to decide if the segment is too short for text
            annotations.append(dict(x=x, y=middle_y, text=innovation, showarrow=False, font=dict(size=6)))
        company_count_stack[x] += y

## Plot layout
fig.update_layout(
    barmode='stack',
    xaxis={'categoryorder':'total descending'},
    title='Occurrence of innovations in the top 10 largest LMs',
    xaxis_title='Company responsible for innovation',
    yaxis_title='Number of occurrences',
    showlegend=False,
    # annotations=annotations,
    autosize=False,
    width=400,
    height=300,
    title_x=0.5,
    font=dict(size=10),
    margin=dict(l=20, r=20, t=30, b=20),
)

## Save plot
save_plot(fig, result_file_location, 'key_innovations_occurrence')

# Show the figure
fig.show()

# Count occurrence of innovations by citations

Create ID => affiliations mapping

In [50]:
origin_affiliation = {}
for i, row in origins_df.iterrows():
    title = row['Origin title']
    search_title = title.replace(',', '')
    affiliations = row['Origin affiliations']
    affiliations = [affiliation.strip() for affiliation in affiliations.split(';')]
    print(title)

    search_results = Works().search_filter(title=search_title).get()
    if len(search_results) > 0:
        for result in search_results:
            if result['title'].lower() == title.lower():
                print("Matched title: ", result['title'])
                origin_affiliation[result['id']] = affiliations

        # top_result = search_results[0]
        # print("Top result: ", top_result['title'])
        # # Check relevance of result is high enough
        # if len(search_results) > 1:
        #     print("Relevance score: ", top_result['relevance_score'])
        #     # Factor out the number of citations - we want to match on the title
        #     match_score = top_result['relevance_score'] / top_result['cited_by_count']**0.5
        #     print("Match score: ", match_score)
        #     if match_score >= 50:
        #         origin_affiliation[top_result['id']] = affiliations
        # else:
        #     origin_affiliation[top_result['id']] = affiliations
    print()

Attention Is All You Need
Matched title:  Attention is All you Need
Matched title:  Attention Is All You Need
Matched title:  Attention Is All You Need

Scaling Laws for Neural Language Models
Matched title:  Scaling Laws for Neural Language Models

Training Compute-Optimal Large Language Models
Matched title:  Training Compute-Optimal Large Language Models

Improving Language Understanding by Generative Pre-Training

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Matched title:  BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding

Neural Machine Translation by Jointly Learning to Align and Translate
Matched title:  Neural Machine Translation by Jointly Learning to Align and Translate
Matched title:  Neural Machine Translation by Jointly Learning to Align and Translate
Matched title:  Neural Machine Translation by Jointly Learning to Align and Translate

FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awar

In [51]:
origin_affiliation

{'https://openalex.org/W2963403868': ['Google Brain',
  'Google Research',
  'University of Toronto'],
 'https://openalex.org/W2626778328': ['Google Brain',
  'Google Research',
  'University of Toronto'],
 'https://openalex.org/W4385245566': ['Google Brain',
  'Google Research',
  'University of Toronto'],
 'https://openalex.org/W3001279689': ['Johns Hopkins University', 'OpenAI'],
 'https://openalex.org/W4225591000': ['Google DeepMind'],
 'https://openalex.org/W2896457183': ['Google'],
 'https://openalex.org/W2964308564': ['Jacobs University Bremen, Germany',
  'University of Montreal'],
 'https://openalex.org/W2133564696': ['Jacobs University Bremen, Germany',
  'University of Montreal'],
 'https://openalex.org/W4297734170': ['Jacobs University Bremen, Germany',
  'University of Montreal'],
 'https://openalex.org/W4226278401': ['OpenAI'],
 'https://openalex.org/W2964263543': ['OpenAI', 'DeepMind'],
 'https://openalex.org/W2626804490': ['OpenAI', 'DeepMind'],
 'https://openalex.org/W

Now get a list of notable ML system papers

In [52]:
# Download dataset from the Parameters, Compute and Data Trends in ML sheet
sheet_id = '1AAIebjNsnJj_uKALHbXNfn3_YsT6sHXtCU0q7OIPuc4'
data_url = f'https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet='
notable_df = pd.read_csv(data_url + 'NOTABLE%20ML%20SYSTEMS')

In [53]:
notable_df.head()

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
0,Gen-2,Text-to-Video,Video generation,Runway,Industry,,2023-12-31,,https://research.runwayml.com/gen2,0.0,...,,,,,,,,Unverified,,2023-08-15 20:28:23
1,InternLM,Language,Language modelling,"Shanghai AI Lab,SenseTime",Academia,,2023-07-06,,https://internlm.org/,,...,Training performance for the open-source Inter...,NVIDIA A100 SXM4 80 GB,,,,,,Speculative,Pre-training a bilingual 100B Foundation model...,2023-08-15 20:04:54
2,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,,...,,,,,,,,Unverified,,2023-07-05 16:08:00
3,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-06-27 15:14:23
4,RoboCat,Robotics,,"Google DeepMind,Google",Industry,"Konstantinos Bousmalis, Giulia Vezzani, Dushya...",2023-06-20,RoboCat: A Self-Improving Foundation Agent for...,https://arxiv.org/abs/2306.11706,,...,,,,,,,Industry,Speculative,The ability to leverage heterogeneous robotic ...,2023-08-10 15:22:11


In [54]:
# Ensure date column is in datetime format
notable_df['Publication date'] = pd.to_datetime(notable_df['Publication date'])

In [55]:
# Filter by Language domain since 2021
notable_language_df = notable_df[notable_df['Domain'] == 'Language']
notable_recent_language_df = notable_language_df[notable_df['Publication date'] >= pd.to_datetime('2021-01-01')]
notable_recent_language_df


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
1,InternLM,Language,Language modelling,"Shanghai AI Lab,SenseTime",Academia,,2023-07-06,,https://internlm.org/,,...,Training performance for the open-source Inter...,NVIDIA A100 SXM4 80 GB,,,,,,Speculative,Pre-training a bilingual 100B Foundation model...,2023-08-15 20:04:54
2,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,,...,,,,,,,,Unverified,,2023-07-05 16:08:00
3,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-06-27 15:14:23
5,PaLM 2,Language,Language modelling,Google,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,,PaLM 2 was trained on TPU v4 according to the ...,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-08-10 15:21:27
7,LLaMA (65B),Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Thibaut Lavril, Gautier Izacard,...",2023-02-24,LLaMA: Open and Efficient Foundation Language ...,https://arxiv.org/abs/2302.13971,702.0,...,"""When training a 65B-parameter model, our code...",NVIDIA A100,Supervised,1179384.75,1023384 processor-hours on A100 GPUs. May 2023...,,Industry,Likely,"We introduce LLaMA, a collection of foundation...",2023-07-28 16:26:34
9,ALM 1.0,Language,Language modelling,BAAI,Academia,,2022-11-28,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-08-15 17:42:35
10,BLOOM,Language,Language model,"Hugging Face,BigScience",Research Collective,"Margaret Mitchell, Giada Pistilli, Yacine Jern...",2022-11-08,BigScience Large Open-science Open-access Mult...,https://huggingface.co/bigscience/bloom,,...,,,Self-supervised learning,,,Yes,,,,2023-08-04 13:13:07
14,AlexaTM 20B,Language,Language modelling,Amazon,Industry,"Saleh Soltan, Shankar Ananthakrishnan, Jack Fi...",2022-08-02,AlexaTM 20B: Few-Shot Learning Using a Large-S...,https://arxiv.org/abs/2208.01448,,...,"See p.5 of the paper: ""We trained AlexaTM 20B ...",,,,,,Industry,,"In this work, we demonstrate that multilingual...",2023-06-08 00:39:43
15,NLLB,Language,Translation,Meta AI,Industry,"Marta R. Costa-jussà, James Cross, Onur Çelebi...",2022-07-06,No Language Left Behind: Scaling Human-Centere...,https://research.facebook.com/publications/no-...,19.0,...,,,,39175.64,,Yes,Industry,,Driven by the goal of eradicating language bar...,2023-05-29 20:51:04
16,Minerva (540B),Language,Quantitative Reasoning Problems,Google,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,Self-supervised learning,3267257.75,,Yes,Industry,,Language models have achieved remarkable perfo...,2023-08-10 15:22:32


In [56]:
for i, row in notable_recent_language_df.iterrows():
    print(row['System'], row['Link'])

InternLM https://internlm.org/
ERNIE 3.5 http://research.baidu.com/Blog/index-view?id=185
Inflection-1 https://inflection.ai/assets/Inflection-1.pdf
PaLM 2 https://ai.google/static/documents/palm2techreport.pdf
LLaMA (65B) https://arxiv.org/abs/2302.13971
ALM 1.0 https://github.com/FlagAI-Open/FlagAI/blob/master/examples/ALM/README.md
BLOOM https://huggingface.co/bigscience/bloom
AlexaTM 20B https://arxiv.org/abs/2208.01448
NLLB https://research.facebook.com/publications/no-language-left-behind/?utm_source=twitter&utm_medium=organic_social&utm_campaign=nllb&utm_content=os-artifacts
Minerva (540B) https://arxiv.org/abs/2206.14858
UL2 https://arxiv.org/abs/2205.05131v1
Sparse all-MLP https://arxiv.org/abs/2203.06850
PaLM (540B) https://arxiv.org/abs/2204.02311
Chinchilla https://arxiv.org/abs/2203.15556
LaMDA https://arxiv.org/abs/2201.08239
GPT-NeoX-20B https://blog.eleuther.ai/announcing-20b/
AlphaCode https://arxiv.org/pdf/2203.07814.pdf
InstructGPT https://cdn.openai.com/papers/Train

In [57]:
notable_recent_language_df = notable_recent_language_df.dropna(subset=['Reference'])

In [58]:
notable_recent_language_df

Unnamed: 0,System,Domain,Task,Organization,Organization Categorization,Authors,Publication date,Reference,Link,Citations,...,Training time notes,Training hardware,Approach,Training compute cost (2020 USD),Compute cost notes,Self-supervised training,Compute Sponsor Categorization,Epistemic status,Abstract,Last Modified
2,ERNIE 3.5,Language,Language modelling,Baidu,Industry,,2023-06-27,Introducing ERNIE 3.5: Baidu’s Knowledge-Enhan...,http://research.baidu.com/Blog/index-view?id=185,,...,,,,,,,,Unverified,,2023-07-05 16:08:00
3,Inflection-1,Language,Language modelling,Inflection AI,Industry,,2023-06-23,Inflection-1 technical memo,https://inflection.ai/assets/Inflection-1.pdf,,...,,NVIDIA H100 SXM5,,,,,Industry,Speculative,Large language models (LLMs) based on the Tran...,2023-06-27 15:14:23
5,PaLM 2,Language,Language modelling,Google,Industry,"Andrew M. Dai, David R. So, Dmitry Lepikhin, J...",2023-05-10,PaLM 2 Technical Report,https://ai.google/static/documents/palm2techre...,,...,,,,,PaLM 2 was trained on TPU v4 according to the ...,,Industry,,"We introduce PaLM 2, a new state-of-the-art la...",2023-08-10 15:21:27
7,LLaMA (65B),Language,Language modelling,Meta AI,Industry,"Hugo Touvron, Thibaut Lavril, Gautier Izacard,...",2023-02-24,LLaMA: Open and Efficient Foundation Language ...,https://arxiv.org/abs/2302.13971,702.0,...,"""When training a 65B-parameter model, our code...",NVIDIA A100,Supervised,1179384.75,1023384 processor-hours on A100 GPUs. May 2023...,,Industry,Likely,"We introduce LLaMA, a collection of foundation...",2023-07-28 16:26:34
9,ALM 1.0,Language,Language modelling,BAAI,Academia,,2022-11-28,ALM 1.0,https://github.com/FlagAI-Open/FlagAI/blob/mas...,,...,,,,,,,,Speculative,,2023-08-15 17:42:35
10,BLOOM,Language,Language model,"Hugging Face,BigScience",Research Collective,"Margaret Mitchell, Giada Pistilli, Yacine Jern...",2022-11-08,BigScience Large Open-science Open-access Mult...,https://huggingface.co/bigscience/bloom,,...,,,Self-supervised learning,,,Yes,,,,2023-08-04 13:13:07
14,AlexaTM 20B,Language,Language modelling,Amazon,Industry,"Saleh Soltan, Shankar Ananthakrishnan, Jack Fi...",2022-08-02,AlexaTM 20B: Few-Shot Learning Using a Large-S...,https://arxiv.org/abs/2208.01448,,...,"See p.5 of the paper: ""We trained AlexaTM 20B ...",,,,,,Industry,,"In this work, we demonstrate that multilingual...",2023-06-08 00:39:43
15,NLLB,Language,Translation,Meta AI,Industry,"Marta R. Costa-jussà, James Cross, Onur Çelebi...",2022-07-06,No Language Left Behind: Scaling Human-Centere...,https://research.facebook.com/publications/no-...,19.0,...,,,,39175.64,,Yes,Industry,,Driven by the goal of eradicating language bar...,2023-05-29 20:51:04
16,Minerva (540B),Language,Quantitative Reasoning Problems,Google,Industry,"Aitor Lewkowycz, Anders Andreassen, David Doha...",2022-06-29,Solving Quantitative Reasoning Problems with L...,https://arxiv.org/abs/2206.14858,,...,,,Self-supervised learning,3267257.75,,Yes,Industry,,Language models have achieved remarkable perfo...,2023-08-10 15:22:32
21,UL2,Language,,"Google Research,Google Brain",Industry,"Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier...",2022-05-10,Unifying Language Learning Paradigms,https://arxiv.org/abs/2205.05131v1,19.0,...,,,,,,,Industry,,,2023-08-11 19:08:43


In [59]:
notable_recent_language_works = []
for i, row in notable_recent_language_df.iterrows():
    title = row['Reference']
    print('Reference:', title)
    # Remove commas to avoid issues with query parsing
    search_title = title.replace(',', '')
    search_results = Works().search_filter(title=search_title).get()
    if len(search_results) > 0:
        for result in search_results:
            if result['title'].lower() == title.lower():
                print("Matched title: ", result['title'])
                notable_recent_language_works.append(result)

        # top_result = search_results[0]
        # print("Top result: ", top_result['title'])
        # # Check relevance of result is high enough
        # if len(search_results) > 1 and top_result.get('relevance_score') is not None:
        #     print("Relevance score: ", top_result['relevance_score'])
        #     # Factor out the number of citations - we want to match on the title
        #     match_score = top_result['relevance_score'] / top_result['cited_by_count']**0.5
        #     print("Match score: ", match_score)
        #     if match_score >= 50:
        #         notable_recent_language_works.append(top_result)
        # else:
        #     notable_recent_language_works.append(top_result)
    print()

Reference: Introducing ERNIE 3.5: Baidu’s Knowledge-Enhanced Foundation Model Takes a Giant Leap Forward

Reference: Inflection-1 technical memo

Reference: PaLM 2 Technical Report
Matched title:  PaLM 2 Technical Report

Reference: LLaMA: Open and Efficient Foundation Language Models
Matched title:  LLaMA: Open and Efficient Foundation Language Models

Reference: ALM 1.0

Reference: BigScience Large Open-science Open-access Multilingual Language Model

Reference: AlexaTM 20B: Few-Shot Learning Using a Large-Scale Multilingual Seq2Seq Model

Reference: No Language Left Behind: Scaling Human-Centered Machine Translation
Matched title:  No Language Left Behind: Scaling Human-Centered Machine Translation

Reference: Solving Quantitative Reasoning Problems with Language Models
Matched title:  Solving Quantitative Reasoning Problems with Language Models

Reference: Unifying Language Learning Paradigms

Reference: Efficient Language Modeling with Sparse all-MLP
Matched title:  Efficient Lang

In [60]:
print(len(notable_recent_language_works), 'works')
for work in notable_recent_language_works:
    print(work['display_name'])

25 works
PaLM 2 Technical Report
LLaMA: Open and Efficient Foundation Language Models
No Language Left Behind: Scaling Human-Centered Machine Translation
Solving Quantitative Reasoning Problems with Language Models
Efficient Language Modeling with Sparse all-MLP
PaLM: Scaling Language Modeling with Pathways
Training Compute-Optimal Large Language Models
LaMDA: Language Models for Dialog Applications
Competition-Level Code Generation with AlphaCode
Training language models to follow instructions with human feedback
Few-shot Learning with Multilingual Language Models
PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation
Finetuned Language Models Are Zero-Shot Learners
Finetuned Language Models Are Zero-Shot Learners
Larger-Scale Transformers for Multilingual Masked Language Modeling
Larger-Scale Transformers for Multilingual Masked Language Modeling
Larger-Scale Transformers for Multilingual Masked Language Modeling
HuBERT: Self-Supervised Speech Representation Learning

In [61]:
algorithm_occurences = defaultdict(list)

- For each work in `notable_works`
  - Fetch the list of references
  - For each reference
    - If its ID is in `algorithm_origin_works`
    - Append the ID of the work to the occurrence dict

In [62]:
for work in notable_recent_language_works:
    print(work['display_name'])
    # Fetch the list of references
    references = work['referenced_works']
    if len(references) == 0:
        print('No references found')
        continue
    for reference_work_id in references:
        if reference_work_id in origin_affiliation.keys():
            algorithm_occurences[reference_work_id].append(work['id'])

PaLM 2 Technical Report
No references found
LLaMA: Open and Efficient Foundation Language Models
No references found
No Language Left Behind: Scaling Human-Centered Machine Translation
No references found
Solving Quantitative Reasoning Problems with Language Models
No references found
Efficient Language Modeling with Sparse all-MLP
No references found
PaLM: Scaling Language Modeling with Pathways
No references found
Training Compute-Optimal Large Language Models
No references found
LaMDA: Language Models for Dialog Applications
No references found
Competition-Level Code Generation with AlphaCode
No references found
Training language models to follow instructions with human feedback
No references found
Few-shot Learning with Multilingual Language Models
No references found
PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation
Finetuned Language Models Are Zero-Shot Learners
Finetuned Language Models Are Zero-Shot Learners
No references found
Larger-Scale Transformers f

In [63]:
algorithm_occurences

defaultdict(list,
            {'https://openalex.org/W2896457183': ['https://openalex.org/W3200128700'],
             'https://openalex.org/W2964121744': ['https://openalex.org/W3200128700',
              'https://openalex.org/W3155584966'],
             'https://openalex.org/W3001279689': ['https://openalex.org/W3200128700',
              'https://openalex.org/W3177813494',
              'https://openalex.org/W3023786569',
              'https://openalex.org/W3155584966'],
             'https://openalex.org/W2963403868': ['https://openalex.org/W3185293939',
              'https://openalex.org/W3159134453',
              'https://openalex.org/W3023786569',
              'https://openalex.org/W3155584966'],
             'https://openalex.org/W1522301498': ['https://openalex.org/W3169320628',
              'https://openalex.org/W3023786569'],
             'https://openalex.org/W2626778328': ['https://openalex.org/W3177813494',
              'https://openalex.org/W3153553004'],
          

In [64]:
for algorithm_id, occurrence_ids in algorithm_occurences.items():
    print(Works()[algorithm_id]['display_name'])
    print([Works()[occurrence_id]['display_name'] for occurrence_id in occurrence_ids])
    print()

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
['PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation']

Adam: A Method for Stochastic Optimization
['PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation', 'Recipes for Building an Open-Domain Chatbot']

Scaling Laws for Neural Language Models
['PLATO-XL: Exploring the Large-scale Pre-training of Dialogue Generation', 'Evaluating Large Language Models Trained on Code', 'Recipes for building an open-domain chatbot', 'Recipes for Building an Open-Domain Chatbot']

Attention is All you Need
['Larger-Scale Transformers for Multilingual Masked Language Modeling', 'Larger-Scale Transformers for Multilingual Masked Language Modeling', 'Recipes for building an open-domain chatbot', 'Recipes for Building an Open-Domain Chatbot']

Adam: A Method for Stochastic Optimization
['HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units', 'Recipes 

In [6]:
w = Works().search_filter(title="Attention is all you need")
tmp = w.get()
tmp

[{'id': 'https://openalex.org/W2963403868',
  'doi': None,
  'title': 'Attention is All you Need',
  'display_name': 'Attention is All you Need',
  'relevance_score': 13292.191,
  'publication_year': 2017,
  'publication_date': '2017-06-12',
  'ids': {'openalex': 'https://openalex.org/W2963403868', 'mag': '2963403868'},
  'language': 'en',
  'primary_location': {'is_oa': False,
   'landing_page_url': 'https://arxiv.org/pdf/1706.03762v5',
   'pdf_url': None,
   'source': {'id': 'https://openalex.org/S4306400194',
    'display_name': 'arXiv (Cornell University)',
    'issn_l': None,
    'issn': None,
    'is_oa': True,
    'is_in_doaj': False,
    'host_organization': 'https://openalex.org/I205783295',
    'host_organization_name': 'Cornell University',
    'host_organization_lineage': ['https://openalex.org/I205783295'],
    'host_organization_lineage_names': ['Cornell University'],
    'type': 'repository'},
   'license': None,
   'version': None,
   'is_accepted': False,
   'is_publis