In [7]:
from functools import partial

import pandas as pd
import pandas_gbq as pbq

read_gbq = partial(pbq.read_gbq, progress_bar_type=None)

## Top L0 Publication Counts, Pre and Post

We're using a backup from 2024-07-19 as the pre-update snapshot.

Below are counts of publications by their top L0 field, post-update, along with the percentage of the corpus this represents.

In [8]:
l0_counts_post = read_gbq("""
select
  top_l0,
  count(*) as n,
  round(count(*) / sum(count(*)) over () * 100, 1) as pct
from fields_of_study_v2.top_fields
group by top_l0
order by pct desc
""")
l0_counts_post

Unnamed: 0,top_l0,n,pct
0,Psychology,29185375,14.1
1,Medicine,27967352,13.5
2,Biology,24078646,11.6
3,Materials Science,14207251,6.9
4,History,11957866,5.8
5,Computer Science,11809944,5.7
6,Philosophy,11583319,5.6
7,Chemistry,10551545,5.1
8,Economics,9121587,4.4
9,Mathematics,7687274,3.7


Below is the same, pre-update.

In [11]:
l0_counts_pre = read_gbq("""
select
  field.name as top_l0,
  count(*) as n,
  round(count(*) / sum(count(*)) over () * 100, 1) as pct
from staging_fields_of_study_v2.top_fields_20240719
cross join unnest(fields) as field
where field.level = 0
group by top_l0
order by pct desc
""")
l0_counts_pre

Unnamed: 0,top_l0,n,pct
0,Biology,38612800,18.6
1,Medicine,26121950,12.6
2,Psychology,17610897,8.5
3,Materials science,14960555,7.2
4,Chemistry,14796493,7.1
5,Computer science,12753764,6.2
6,Environmental science,10761756,5.2
7,History,10625528,5.1
8,Economics,9871512,4.8
9,Business,8461657,4.1


Merging and sorting for easier comparison, below. The `pct_diff` column gives the percentage-point difference in corpus shares.

In [24]:
l0_counts = pd.merge(l0_counts_pre, l0_counts_post, on='top_l0', suffixes=('_pre', '_post'))
l0_counts['n_diff'] = l0_counts['n_post'] - l0_counts['n_pre']
l0_counts['pct_diff'] = l0_counts['pct_post'] - l0_counts['pct_pre']
l0_counts.sort_values('pct_post', ascending=False).head(20)

Unnamed: 0,top_l0,n_pre,pct_pre,n_post,pct_post,n_diff,pct_diff
2,Psychology,17610897,8.5,29185375,14.1,11574478,5.6
1,Medicine,26121950,12.6,27967352,13.5,1845402,0.9
0,Biology,38612800,18.6,24078646,11.6,-14534154,-7.0
4,History,10625528,5.1,11957866,5.8,1332338,0.7
13,Philosophy,3952267,1.9,11583319,5.6,7631052,3.7
3,Chemistry,14796493,7.1,10551545,5.1,-4244948,-2.0
5,Economics,9871512,4.8,9121587,4.4,-749925,-0.4
10,Mathematics,5040931,2.4,7687274,3.7,2646343,1.3
11,Physics,4711357,2.3,7267021,3.5,2555664,1.2
6,Business,8461657,4.1,6873742,3.3,-1587915,-0.8


## Top L1 Publication Counts, Pre and Post

This section repeats the exercise from above, but for L1 fields.

Here's post-update:

In [17]:
l1_counts_post = read_gbq(
    """
    select
        top_l1,
        count(*) as n,
        round(count(*) / sum(count(*)) over () * 100, 1) as pct
    from fields_of_study_v2.top_fields
    group by top_l1
    order by pct desc
    """)
l1_counts_post

Unnamed: 0,top_l1,n,pct
0,Arithmetic,5319612,2.6
1,Geophysics,4508455,2.2
2,Artificial Intelligence,4429799,2.1
3,Nuclear Medicine,3503060,1.7
4,Gerontology,3403610,1.6
...,...,...,...
272,Earth Science,42168,0.0
273,Finance,102807,0.0
274,Thermodynamics,16,0.0
275,Toxicology,129,0.0


This is pre-update:

In [21]:
l1_counts_pre = read_gbq("""
    select
      field.name as top_l1,
      count(*) as n,
      round(count(*) / sum(count(*)) over () * 100, 1) as pct
    from staging_fields_of_study_v2.top_fields_20240719
    cross join unnest(fields) as field
    where field.level = 1
    group by top_l1
    order by pct desc
""")
l1_counts_pre

Unnamed: 0,top_l1,n,pct
0,Process management,60299132,28.2
1,Cell biology,3502830,1.6
2,Cardiology,3412052,1.6
3,Computer simulation,3132133,1.5
4,Mathematical optimization,2856736,1.3
...,...,...,...
276,Biochemical engineering,62636,0.0
277,Biophysics,28980,0.0
278,Pure mathematics,49705,0.0
279,Architectural engineering,106426,0.0


And together:

In [25]:
l1_counts = pd.merge(l1_counts_pre, l1_counts_post, on='top_l1', suffixes=('_pre', '_post'))
l1_counts['n_diff'] = l1_counts['n_post'] - l1_counts['n_pre']
l1_counts['pct_diff'] = l1_counts['pct_post'] - l1_counts['pct_pre']
l1_counts.sort_values('pct_post', ascending=False)

Unnamed: 0,top_l1,n_pre,pct_pre,n_post,pct_post,n_diff,pct_diff
88,Arithmetic,166282,0.1,5319612,2.6,5153330,2.5
48,Geophysics,443182,0.2,4508455,2.2,4065273,2.0
34,Gerontology,588562,0.3,3403610,1.6,2815048,1.3
108,Radiochemistry,99180,0.0,3020930,1.5,2921750,1.5
0,Cardiology,3412052,1.6,2815897,1.4,-596155,-0.2
...,...,...,...,...,...,...,...
38,Anesthesia,690851,0.3,46523,0.0,-644328,-0.3
41,Thermodynamics,600205,0.3,16,0.0,-600189,-0.3
43,Finance,323584,0.2,102807,0.0,-220777,-0.2
68,Paleontology,143168,0.1,92204,0.0,-50964,-0.1


The same, sorted by percentage-point difference in corpus share.

In [23]:
l1_counts.sort_values('pct_diff', ascending=False)

Unnamed: 0,top_l1,n_pre,pct_pre,n_post,pct_post,n_diff,pct_diff
88,Arithmetic,166282,0.1,5319612,2.6,5153330,2.5
48,Geophysics,443182,0.2,4508455,2.2,4065273,2.0
108,Radiochemistry,99180,0.0,3020930,1.5,2921750,1.5
34,Gerontology,588562,0.3,3403610,1.6,2815048,1.3
91,Geometry,316063,0.1,2201835,1.1,1885772,1.0
...,...,...,...,...,...,...,...
1,Pedagogy,2424051,1.1,1151019,0.6,-1273032,-0.5
10,Geochemistry,1575379,0.7,284041,0.1,-1291338,-0.6
13,Oncology,1517478,0.7,199803,0.1,-1317675,-0.6
3,Biochemistry,1838850,0.9,671921,0.3,-1166929,-0.6
