In [1]:
from collections import defaultdict
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go

from plotting import *

In [2]:
results_dir = 'results/org_analysis/'
os.makedirs(results_dir, exist_ok=True)

url = "data/ML Systems-Org view.csv"

df = pd.read_csv(url)

cutoff_date = '2018-01-01'

In [3]:
df.head()

Unnamed: 0,System,Publication date,Notability criteria,Link,Model accessibility,Code accessibility,Training dataset,Dataset accessibility,Accessibility notes,Organization,Training compute (FLOP),Country (from Organization),Organization categorization
0,Grok-2,2024-08-13,,https://x.ai/blog/grok-2,Hosted access (no API),Unreleased,,,,xAI,,United States of America,Industry
1,Grok-2 mini,2024-08-13,,https://x.ai/blog/grok-2,Hosted access (no API),Unreleased,,,,xAI,,United States of America,Industry
2,Cosine Genie,2024-08-12,,https://cosine.sh/blog/genie-technical-report,Unreleased,,Unspecified unreleased,,https://github.com/CosineAI/experiments/tree/c...,Cosine,,United Kingdom of Great Britain and Northern I...,Industry
3,EXAONE 3.0,2024-08-07,,https://arxiv.org/abs/2408.03541,Open access (non-commercial),,Unspecified unreleased,,,LG,4e+23,Korea (Republic of),Industry
4,Flux.1 [pro],2024-08-01,,https://blackforestlabs.ai/announcing-black-fo...,API access,Unreleased,,Unreleased,,Black Forest Labs,,,


In [4]:
df['Organization'].unique()

array(['xAI', 'Cosine', 'LG', 'Black Forest Labs', 'Apple',
       'Google DeepMind', 'Mistral AI', 'Meta AI', 'OpenAI', 'Alibaba',
       'EvolutionaryScale,UC Berkeley', 'Anthropic', 'Zhipu AI',
       'Zhipu AI,Tsinghua University', 'DeepSeek', 'Runway', 'NVIDIA',
       'Stanford University,UC Berkeley,Toyota Research Institute,Google DeepMind,Massachusetts Institute of Technology (MIT),Physical Intelligence',
       'Microsoft,University of Illinois Urbana-Champaign (UIUC)',
       'Facebook AI Research', 'Columbia University,Rutgers University',
       '01.AI',
       'Tohoku University,CyberAgent,Tokyo Institute of Technology,Fujitsu,RIKEN,Nagoya University,Kotoba Technologies',
       'Technology Innovation Institute', 'Tokyo Institute of Technology',
       'Microsoft', 'SenseTime', 'Sber', 'Reka AI',
       'NTT Communication Science Laboratories', 'Stability AI', 'Yandex',
       'Cohere,Cohere for AI', 'Silo AI,University of Turku',
       'Google DeepMind,McGill University

In [5]:
{**df.iloc[1]}

{'System': 'Grok-2 mini',
 'Publication date': '2024-08-13',
 'Notability criteria': nan,
 'Link': 'https://x.ai/blog/grok-2',
 'Model accessibility': 'Hosted access (no API)',
 'Code accessibility': 'Unreleased',
 'Training dataset': nan,
 'Dataset accessibility': nan,
 'Accessibility notes': nan,
 'Organization': 'xAI',
 'Training compute (FLOP)': nan,
 'Country (from Organization)': 'United States of America',
 'Organization categorization': 'Industry'}

In [6]:
aliases = {
  'Beijing Academy of Artificial Intelligence / BAAI': 'BAAI',
  'Google': 'Google/DeepMind',
  'Google Research': 'Google/DeepMind',
  'DeepMind': 'Google/DeepMind',
  'Google Brain': 'Google/DeepMind',
  'Google DeepMind': 'Google/DeepMind',
  'Facebook AI Research': 'Meta/Facebook',
  'Facebook AI Research': 'Meta/Facebook',
  'Facebook': 'Meta/Facebook',
  'Facebook AI': 'Meta/Facebook',
  'Meta AI': 'Meta/Facebook',
  'Meta AI Research': 'Meta/Facebook',
  'Massachusetts Institute of Technology (MIT)': 'MIT',
  'Stanford University': 'Stanford',
  'New York University (NYU)': 'NYU',
  'Carnegie Mellon University (CMU)': 'CMU',
  'University of Washington': 'UoW',
  'UC Berkeley': 'UCB',
  'Tsinghua University': 'Tsinghua',
}

new_rows = []
for i, row in df.dropna(subset=['Organization']).iterrows():
  orgs = row['Organization']
  orgs = orgs.split(',')
  for org in orgs:
    if org in aliases:
      org = aliases[org]
    new_row = {**row}
    new_row['Organization'] = org
    new_rows.append(new_row)

df = pd.DataFrame(new_rows)

In [7]:
df.head()

Unnamed: 0,System,Publication date,Notability criteria,Link,Model accessibility,Code accessibility,Training dataset,Dataset accessibility,Accessibility notes,Organization,Training compute (FLOP),Country (from Organization),Organization categorization
0,Grok-2,2024-08-13,,https://x.ai/blog/grok-2,Hosted access (no API),Unreleased,,,,xAI,,United States of America,Industry
1,Grok-2 mini,2024-08-13,,https://x.ai/blog/grok-2,Hosted access (no API),Unreleased,,,,xAI,,United States of America,Industry
2,Cosine Genie,2024-08-12,,https://cosine.sh/blog/genie-technical-report,Unreleased,,Unspecified unreleased,,https://github.com/CosineAI/experiments/tree/c...,Cosine,,United Kingdom of Great Britain and Northern I...,Industry
3,EXAONE 3.0,2024-08-07,,https://arxiv.org/abs/2408.03541,Open access (non-commercial),,Unspecified unreleased,,,LG,4e+23,Korea (Republic of),Industry
4,Flux.1 [pro],2024-08-01,,https://blackforestlabs.ai/announcing-black-fo...,API access,Unreleased,,Unreleased,,Black Forest Labs,,,


In [8]:
def compare_openness_with_compute(df, col_name = 'model_count'):
  def is_open(accessibility):
      return accessibility.startswith('Open') if isinstance(accessibility, str) else False

  df = df[pd.to_datetime(df['Publication date']) >= pd.to_datetime(cutoff_date)]
  df['is_open'] = df['Model accessibility'].apply(is_open)
  df['log_compute'] = np.log10(df['Training compute (FLOP)'])

  # Group by organization and get the stats
  org_stats = df.groupby('Organization').agg({
      'log_compute': ['max', 'mean'],
      'is_open': ['mean', 'count'],
      'Organization categorization': ['first'],
      'Country (from Organization)': ['first'],
      'Notability criteria': ['first'],
  }).reset_index()

  # Flatten the multi-level column index and rename columns
  org_stats.columns = ['Organization', 'Max Training Compute (FLOP)', 'Average Training Compute (FLOP)', 'open_percentage', 'model_count', 'Organization categorization', 'Country (from Organization)', 'Notability criteria']
  org_stats['open_percentage'] = org_stats['open_percentage'] * 100

  # Sort by the most compute-intensive model (descending) and reset index
  result = org_stats.sort_values('Max Training Compute (FLOP)', ascending=False).reset_index(drop=True)
  return result

In [9]:
len(df)

2601

In [10]:
df.dropna(subset=['Notability criteria'], inplace=True)
len(df)

1407

In [11]:
all_models = compare_openness_with_compute(df)
all_models.head(40)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_open'] = df['Model accessibility'].apply(is_open)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['log_compute'] = np.log10(df['Training compute (FLOP)'])


Unnamed: 0,Organization,Max Training Compute (FLOP),Average Training Compute (FLOP),open_percentage,model_count,Organization categorization,Country (from Organization),Notability criteria
0,Google/DeepMind,25.69897,21.747013,30.405405,148,"Academia,Academia,Industry,Industry,Academia,I...","United States of America,United States of Amer...",SOTA improvement
1,Meta/Facebook,25.579784,21.322178,78.823529,85,Industry,United States of America,"SOTA improvement,Training cost"
2,OpenAI,25.322219,22.424921,34.482759,29,Industry,United States of America,"SOTA improvement,Significant use"
3,NVIDIA,25.255273,22.36672,46.153846,13,Industry,United States of America,Training cost
4,Peking University,25.079181,22.523616,0.0,6,"Industry,Academia","China,China",SOTA improvement
5,ByteDance,25.079181,24.147339,0.0,3,"Industry,Academia","China,China",SOTA improvement
6,Mistral AI,25.049218,25.049218,50.0,2,Industry,France,Training cost
7,Inflection AI,25.000434,25.000239,0.0,2,Industry,United States of America,Significant use
8,Anthropic,24.587262,24.587262,0.0,5,Industry,United States of America,"Significant use,SOTA improvement"
9,Technology Innovation Institute,24.575188,23.9777,100.0,2,Government,United Arab Emirates,Training cost


In [12]:
for model_count_threshold in [3, *range(5, 55, 5)]:
  big_org_models = all_models[all_models['model_count'] >= model_count_threshold]
  correlation = big_org_models['Max Training Compute (FLOP)'].corr(big_org_models['open_percentage'], method='spearman')
  print(f"Correlation for model_count_threshold {model_count_threshold} ({len(big_org_models)} data points): {correlation}")

print("\nWithout Meta/Facebook")
for model_count_threshold in [3, *range(5, 55, 5)]:
  big_org_models = all_models[all_models['model_count'] >= model_count_threshold]
  big_org_models = big_org_models[big_org_models['Organization'] != 'Meta/Facebook']
  correlation = big_org_models['Max Training Compute (FLOP)'].corr(big_org_models['open_percentage'], method='spearman')
  print(f"Correlation for model_count_threshold {model_count_threshold} ({len(big_org_models)} data points): {correlation}")

Correlation for model_count_threshold 3 (65 data points): -0.003763671563848325
Correlation for model_count_threshold 5 (32 data points): -0.07121226295380861
Correlation for model_count_threshold 10 (13 data points): -0.2613482527433465
Correlation for model_count_threshold 15 (8 data points): -0.34731161595484505
Correlation for model_count_threshold 20 (3 data points): -0.5
Correlation for model_count_threshold 25 (3 data points): -0.5
Correlation for model_count_threshold 30 (2 data points): -0.9999999999999999
Correlation for model_count_threshold 35 (2 data points): -0.9999999999999999
Correlation for model_count_threshold 40 (2 data points): -0.9999999999999999
Correlation for model_count_threshold 45 (2 data points): -0.9999999999999999
Correlation for model_count_threshold 50 (2 data points): -0.9999999999999999

Without Meta/Facebook
Correlation for model_count_threshold 3 (64 data points): -0.03330865737596513
Correlation for model_count_threshold 5 (31 data points): -0.1344

In [22]:
model_count_threshold = 10
big_org_models = all_models[all_models['model_count'] >= model_count_threshold]
other_org_models = all_models[all_models['model_count'] < model_count_threshold]

fig = go.Figure()
fig.add_trace(
  go.Scatter(
    x=other_org_models['Max Training Compute (FLOP)'],
    y=other_org_models['open_percentage'],
    mode='markers',
    marker=dict(
      size=2*other_org_models['model_count']**0.5,
      color='blue',
      opacity=0.5,
    ),
    showlegend=False,
    # hoverinfo='text',
    # text=other_org_models['Organization'],
  )
)
fig.add_trace(
  go.Scatter(
    x=big_org_models['Max Training Compute (FLOP)'],
    y=big_org_models['open_percentage'],
    mode='markers+text',
    marker=dict(
      size=2*big_org_models['model_count']**0.5,
      color='blue',
      opacity=0.8,
    ),
    text=big_org_models['Organization'],
    textposition='top center',
    showlegend=False,
  )
)

# x limits
fig.update_xaxes({'range': [21, 27]})

# Update layout for better readability
fig.update_layout(
  width=800,
  height=600,
  xaxis_title='Max Training Compute (log-FLOP)',
  yaxis_title='Percentage of Open Models',
  template='plotly_white',
  coloraxis_showscale=False,
  title='Organization Openness vs Compute Intensity',
)

# Save plot
# save_plot(fig, results_dir, 'org_openness_vs_max_compute')

# Show the plot
fig.show()

We can also look at average compute. However, this is less informative because some organizations will have a lower average simply by being older.

In [14]:
fig = go.Figure()
fig.add_trace(
  go.Scatter(
    x=big_org_models['Average Training Compute (FLOP)'],
    y=big_org_models['open_percentage'],
    mode='markers+text',
    marker=dict(
      size=big_org_models['model_count']**0.5,
      color='blue',
      opacity=0.8,
    ),
    text=big_org_models['Organization'],
    textposition='top center',
    showlegend=False,
  )
)
fig.add_trace(
  go.Scatter(
    x=other_org_models['Average Training Compute (FLOP)'],
    y=other_org_models['open_percentage'],
    mode='markers',
    marker=dict(
      size=other_org_models['model_count']**0.5,
      color='blue',
      opacity=0.5,
    ),
    showlegend=False,
    # hoverinfo='text',
    # text=other_org_models['Organization'],
  )
)

# x limits
fig.update_xaxes({'range': [18, 24]})

# Update layout for better readability
fig.update_layout(
  width=800,
  height=600,
  xaxis_title='Average Training Compute (log-FLOP)',
  yaxis_title='Percentage of Open Models',
  template='plotly_white',
  coloraxis_showscale=False,
  title='Organization Openness vs Compute Intensity',
)

# Show the plot
fig.show()

In [15]:
country_openness = defaultdict(list)
for country in ['United States of America', 'China']:
  for i, row in all_models.dropna(subset=['Country (from Organization)', 'Notability criteria']).iterrows():
    if country in row['Country (from Organization)']:
      country_openness[country].append(row['open_percentage'])
  print(f"{country}: {np.mean(country_openness[country]):.0f}% ({np.std(country_openness[country]):.0f})")

United States of America: 52% (43)
China: 37% (42)


In [19]:
# Mann-Whitney U test on country_openness
from scipy.stats import mannwhitneyu

mannwhitneyu(country_openness['United States of America'], country_openness['China'])

MannwhitneyuResult(statistic=4430.5, pvalue=0.04116141548305803)

In [23]:
len(country_openness['United States of America']), len(country_openness['China'])

(139, 54)

In [16]:
org_type_openness = defaultdict(list)
for org_type in ['Industry', 'Academia']:
  for i, row in all_models.dropna(subset=['Organization categorization', 'Notability criteria']).iterrows():
    if org_type in row['Organization categorization']:
      org_type_openness[org_type].append(row['open_percentage'])
  print(f"{org_type}: {np.mean(org_type_openness[org_type]):.0f}% ({np.std(org_type_openness[org_type]):.0f})")

Industry: 53% (44)
Academia: 52% (44)


In [20]:
# U test
mannwhitneyu(org_type_openness['Industry'], org_type_openness['Academia'])

MannwhitneyuResult(statistic=15972.0, pvalue=0.8064253383229103)

In [24]:
len(org_type_openness['Industry']), len(org_type_openness['Academia'])

(174, 181)