In [2]:
from functools import partial

import pandas as pd
import pandas_gbq as pbq

read_gbq = partial(pbq.read_gbq, progress_bar_type=None)

## Top L0 Publication Counts, Pre and Post

We're using a backup from 2024-07-19 as the pre-update snapshot.

Below are counts of publications by their top L0 field, post-update, along with the percentage of the corpus this represents.

In [3]:
l0_counts_post = read_gbq("""
select
  top_l0,
  count(*) as n,
  round(count(*) / sum(count(*)) over () * 100, 1) as pct
from fields_of_study_v2.top_fields
group by top_l0
order by pct desc
""")
l0_counts_post

Unnamed: 0,top_l0,n,pct
0,Biology,35793168,17.3
1,Medicine,26226284,12.7
2,Psychology,19183471,9.3
3,Materials Science,16267002,7.9
4,Computer Science,13741091,6.6
5,Chemistry,13682741,6.6
6,Environmental Science,10874115,5.3
7,Economics,10750670,5.2
8,History,9809043,4.7
9,Business,7943825,3.8


Below is the same, pre-update.

In [4]:
l0_counts_pre = read_gbq("""
select
  field.name as top_l0,
  count(*) as n,
  round(count(*) / sum(count(*)) over () * 100, 1) as pct
from staging_fields_of_study_v2.top_fields_20240719
cross join unnest(fields) as field
where field.level = 0
group by top_l0
order by pct desc
""")
l0_counts_pre

Unnamed: 0,top_l0,n,pct
0,Biology,38612800,18.6
1,Medicine,26121950,12.6
2,Psychology,17610897,8.5
3,Materials science,14960555,7.2
4,Chemistry,14796493,7.1
5,Computer science,12753764,6.2
6,Environmental science,10761756,5.2
7,History,10625528,5.1
8,Economics,9871512,4.8
9,Business,8461657,4.1


Merging and sorting for easier comparison, below. The `pct_diff` column gives the percentage-point difference in corpus shares.

In [5]:
l0_counts = pd.merge(l0_counts_pre, l0_counts_post, on='top_l0', suffixes=('_pre', '_post'))
l0_counts['n_diff'] = l0_counts['n_post'] - l0_counts['n_pre']
l0_counts['pct_diff'] = l0_counts['pct_post'] - l0_counts['pct_pre']
l0_counts.sort_values('pct_post', ascending=False).head(20)

Unnamed: 0,top_l0,n_pre,pct_pre,n_post,pct_post,n_diff,pct_diff
0,Biology,38612800,18.6,35793168,17.3,-2819632,-1.3
1,Medicine,26121950,12.6,26226284,12.7,104334,0.1
2,Psychology,17610897,8.5,19183471,9.3,1572574,0.8
3,Chemistry,14796493,7.1,13682741,6.6,-1113752,-0.5
5,Economics,9871512,4.8,10750670,5.2,879158,0.4
4,History,10625528,5.1,9809043,4.7,-816485,-0.4
6,Business,8461657,4.1,7943825,3.8,-517832,-0.3
7,Geology,7138889,3.4,6851059,3.3,-287830,-0.1
8,Sociology,5792415,2.8,6214445,3.0,422030,0.2
11,Art,4674436,2.3,5834644,2.8,1160208,0.5


## Top L1 Publication Counts, Pre and Post

This section repeats the exercise from above, but for L1 fields.

Here's post-update:

In [6]:
l1_counts_post = read_gbq(
    """
    select
        top_l1,
        count(*) as n,
        round(count(*) / sum(count(*)) over () * 100, 1) as pct
    from fields_of_study_v2.top_fields
    group by top_l1
    order by pct desc
    """)
l1_counts_post

Unnamed: 0,top_l1,n,pct
0,Cardiology,5051113,2.4
1,Library Science,4429376,2.1
2,Cell Biology,4337751,2.1
3,Pedagogy,3466632,1.7
4,Literature,3262788,1.6
...,...,...,...
275,Engineering Physics,91870,0.0
276,Neoclassical Economics,18167,0.0
277,Classical Economics,102688,0.0
278,Pure Mathematics,65910,0.0


This is pre-update:

In [7]:
l1_counts_pre = read_gbq("""
    select
      field.name as top_l1,
      count(*) as n,
      round(count(*) / sum(count(*)) over () * 100, 1) as pct
    from staging_fields_of_study_v2.top_fields_20240719
    cross join unnest(fields) as field
    where field.level = 1
    group by top_l1
    order by pct desc
""")
l1_counts_pre

Unnamed: 0,top_l1,n,pct
0,Process management,60299132,28.2
1,Cell biology,3502830,1.6
2,Cardiology,3412052,1.6
3,Computer simulation,3132133,1.5
4,Mathematical optimization,2856736,1.3
...,...,...,...
276,Environmental engineering,79068,0.0
277,Biochemical engineering,62636,0.0
278,Optometry,71050,0.0
279,Neoclassical economics,16190,0.0


And together:

In [8]:
l1_counts = pd.merge(l1_counts_pre, l1_counts_post, on='top_l1', suffixes=('_pre', '_post'))
l1_counts['n_diff'] = l1_counts['n_post'] - l1_counts['n_pre']
l1_counts['pct_diff'] = l1_counts['pct_post'] - l1_counts['pct_pre']
l1_counts.sort_values('pct_post', ascending=False)

Unnamed: 0,top_l1,n_pre,pct_pre,n_post,pct_post,n_diff,pct_diff
0,Cardiology,3412052,1.6,5051113,2.4,1639061,0.8
1,Pedagogy,2424051,1.1,3466632,1.7,1042581,0.6
9,Literature,1584921,0.7,3262788,1.6,1677867,0.9
2,Surgery,2226307,1.0,2614757,1.3,388450,0.3
6,Botany,1743351,0.8,2436508,1.2,693157,0.4
...,...,...,...,...,...,...,...
104,Mechanics,49471,0.0,49485,0.0,14,0.0
105,Biophysics,28980,0.0,39324,0.0,10344,0.0
45,Andrology,425003,0.2,60448,0.0,-364555,-0.2
111,Sociobiology,34374,0.0,97231,0.0,62857,0.0


The same, sorted by percentage-point difference in corpus share.

In [9]:
l1_counts.sort_values('pct_diff', ascending=False)

Unnamed: 0,top_l1,n_pre,pct_pre,n_post,pct_post,n_diff,pct_diff
9,Literature,1584921,0.7,3262788,1.6,1677867,0.9
0,Cardiology,3412052,1.6,5051113,2.4,1639061,0.8
1,Pedagogy,2424051,1.1,3466632,1.7,1042581,0.6
20,Gastroenterology,1123139,0.5,2045730,1.0,922591,0.5
13,Law,1396937,0.7,2170820,1.1,773883,0.4
...,...,...,...,...,...,...,...
63,Photobiology,137448,0.1,193754,0.1,56306,0.0
89,Calculus,156101,0.1,275818,0.1,119717,0.0
18,Chromatography,1012157,0.5,929339,0.4,-82818,-0.1
45,Andrology,425003,0.2,60448,0.0,-364555,-0.2
