Examine differences between `batch_score_corpus_constrained.py` and prior results.

In [2]:
from fos.settings import CORPUS_DIR
from tqdm import tqdm
import pandas as pd

NROWS = 100_000

In [None]:
def read_top_scores(df, field_col="id"):
    """Read scores from script output and identify top scores by level."""
    df = df.explode('fields')
    df['field'] = df['fields'].apply(lambda x: x[field_col])
    df['score'] = df['fields'].apply(lambda x: x['score'])
    df['norm_field'] = df['field'].str.lower()

    # Field levels aren't included in the scoring output
    meta = pd.read_json("../fields/field_meta.jsonl", lines=True)
    meta['norm_field'] = meta['name'].str.lower()
    df = pd.merge(
        df,
        meta[['name', 'level']],
        left_on='field',
        right_on='name',
    )

    df['rank'] = df.groupby(['merged_id', 'level'], as_index=False)['score']. \
        apply('rank', ascending=False)
    # We don't need the lower-ranked field scores from the original script output
    df = df.loc[df['rank'] <= 10, ['merged_id', 'field', 'score', 'rank']]
    return df

In [19]:
# Read results from original scoring script
chunks = []
for chunk in tqdm(pd.read_json(
        CORPUS_DIR / 'en_scores.jsonl',
        lines=True,
        nrows=NROWS,
        chunksize=5_000,
)):
    chunks.append(read_top_scores(chunk, "id"))
scores = pd.concat(chunks)
scores.head()

20it [14:02, 42.14s/it]


Unnamed: 0,merged_id,field,score,rank
1,carticle_0410190179,Art,0.412466,5.0
2,carticle_0135012857,Art,0.477203,5.0
4,carticle_0000162842,Art,0.627803,1.0
6,carticle_0157627003,Art,0.471244,5.0
7,carticle_0127058960,Art,0.469692,8.0


In [20]:
# Read results from constrained scoring script
con_scores = pd.read_json(
    CORPUS_DIR / 'en_scores_constrained.jsonl',
    lines=True,
    nrows=NROWS
)

con_scores = read_top_scores(con_scores, "name")
con_scores.head()

Unnamed: 0,merged_id,field,score,rank
0,carticle_0262135946,Chemistry,0.656,1.0
1,carticle_0135012857,Chemistry,0.3179,6.0
2,carticle_0033759524,Chemistry,0.5524,10.0
3,carticle_0043485323,Chemistry,0.2811,6.0
4,carticle_0134038379,Chemistry,0.3232,1.0


In [21]:
# More scores from unconstrained L2/L3 scoring as expected
len(scores), len(con_scores)

(3922052, 1903259)

In [23]:
# Parsing and transforming takes a while, so serialize for reuse
scores.to_pickle("scores.pkl")
con_scores.to_pickle("con_scores.pkl")

In [24]:
# Confirm we're looking at the same publications
assert set(scores.merged_id) == set(con_scores.merged_id)

In [25]:
# Create a standardized field name for joining on pub + field
scores['norm_field'] = scores['field'].str.lower()
con_scores['norm_field'] = con_scores['field'].str.lower()

diff = pd.merge(
    scores[['merged_id', 'field', 'score', 'norm_field', 'rank']],
    con_scores[['merged_id', 'field', 'score', 'norm_field', 'rank']],
    on=['merged_id', 'norm_field'],
    suffixes=('', '_c'),
    how='left',
)
diff.drop(columns=['norm_field', 'field'], inplace=True)
len(diff)

3948683

In [39]:
# Calculate absolute differences
diff['diff'] = diff['score_c'].round(3) - diff['score'].round(3)
diff['diff'] = diff['diff'].abs()
diff.head()

Unnamed: 0,merged_id,score,rank,field_c,score_c,rank_c,diff,has_diff,has_rank_diff
0,carticle_0410190179,0.412466,5.0,Art,0.275,5.0,0.137,True,False
1,carticle_0135012857,0.477203,5.0,Art,0.3181,5.0,0.159,True,False
2,carticle_0000162842,0.627803,1.0,Art,0.6278,1.0,0.0,True,False
3,carticle_0157627003,0.471244,5.0,Art,0.3142,5.0,0.157,True,False
4,carticle_0127058960,0.469692,8.0,Art,0.3131,7.5,0.157,True,False


In [40]:
# Characterize
diff['diff'].describe().round(3)

count    959952.000
mean          0.094
std           0.080
min           0.000
25%           0.000
50%           0.127
75%           0.152
max           0.531
Name: diff, dtype: float64

In [41]:
diff['has_diff'] = diff['diff'] > 0.000
diff['has_diff'].value_counts(normalize=True)

False    0.84072
True     0.15928
Name: has_diff, dtype: float64

In [42]:
diff.loc[~diff['diff'].isna()]

Unnamed: 0,merged_id,score,rank,field_c,score_c,rank_c,diff,has_diff,has_rank_diff
0,carticle_0410190179,0.412466,5.0,Art,0.2750,5.0,0.137,True,False
1,carticle_0135012857,0.477203,5.0,Art,0.3181,5.0,0.159,True,False
2,carticle_0000162842,0.627803,1.0,Art,0.6278,1.0,0.000,False,False
3,carticle_0157627003,0.471244,5.0,Art,0.3142,5.0,0.157,True,False
4,carticle_0127058960,0.469692,8.0,Art,0.3131,7.5,0.157,True,False
...,...,...,...,...,...,...,...,...,...
3885888,carticle_0423327321,0.662955,6.0,Synthetic Biology,0.6544,9.0,0.009,True,True
3908664,carticle_0177109305,0.644539,1.0,Natural Language Processing,0.6270,2.0,0.018,True,True
3919554,carticle_0235085803,0.660392,8.0,Antigenic Shift,0.6395,2.5,0.020,True,True
3919568,carticle_0032581328,0.521683,8.0,Antimicrobial Resistance,0.3403,3.0,0.182,True,True


In [43]:
diff.groupby(['field_c', 'has_diff'])['diff'].agg('count').sort_values(ascending=False)

field_c                  has_diff
Psychology               True        48642
Economics                True        44163
Environmental Science    True        43654
Physics                  True        39575
Engineering              True        38700
                                     ...  
Nanotechnology           False           1
Crystallography          False           1
Epistemology             True            1
Environmental Chemistry  False           1
Geochemistry             False           1
Name: diff, Length: 439, dtype: int64

In [44]:
diff.groupby('has_diff')['merged_id'].agg('nunique').sort_values(ascending=False)

has_diff
False    98054
True     85232
Name: merged_id, dtype: int64

In [45]:
diff['has_rank_diff'] = (diff['rank'] - diff['rank_c']).abs() > 0.5
diff['has_rank_diff'].value_counts(normalize=True)

False    0.883699
True     0.116301
Name: has_rank_diff, dtype: float64

In [46]:
diff['diff'].quantile([.25, .33, .50, .75, .95, .99, 1])

0.25    0.000
0.33    0.000
0.50    0.127
0.75    0.152
0.95    0.172
0.99    0.341
1.00    0.531
Name: diff, dtype: float64