We've downloaded the 100 highest-scoring papers from MAG for each level-0 field. These papers are exemplars of that field, and we'd expect our model to score them highly in that field too.

In [88]:
from collections import defaultdict, Counter

from fos.model import FieldModel
from fos.settings import ASSETS_DIR
import pandas as pd
import numpy as np

from fos.entity import embed_entities
from fos.vectors import embed_fasttext, embed_tfidf

# Exemplar papers; the cells below read each row's 'display_name' and 'text'.
# NOTE(review): read_pickle unpickles arbitrary objects -- only load trusted asset files.
mag_texts = pd.read_pickle(ASSETS_DIR / 'fields/example_text.pkl.gz')
# Field metadata; score() joins on its index and reads 'display_name' and 'level'.
meta = pd.read_pickle(ASSETS_DIR / 'fields/fos.pkl.gz')
# Project model; later cells read attributes such as .index, .fasttext and
# .field_fasttext from it (see fos.model for what each one holds).
fields = FieldModel("en")

In [98]:
def score(doc_vector, field_index, field_matrix):
    """Score one document against every field and keep the level-0 fields.

    Args:
        doc_vector: Document representation used to index ``field_matrix``.
        field_index: Field ids, one per score that ``field_matrix`` returns.
        field_matrix: Object whose ``[doc_vector]`` lookup yields the scores
            (presumably a similarity index; confirm in ``fos.model``).

    Returns:
        DataFrame with ``field_id``, ``score``, ``display_name`` and ``level``
        columns, restricted to ``level == 0`` and sorted by descending score.
    """
    similarities = field_matrix[doc_vector]
    frame = pd.DataFrame({'field_id': field_index, 'score': similarities})
    # Attach the human-readable name and hierarchy level of each field id.
    field_info = meta[['display_name', 'level']]
    frame = frame.merge(field_info, left_on='field_id', right_index=True)
    top_level = frame[frame['level'] == 0]
    return top_level.sort_values('score', ascending=False)

def print_ranks(ranks, n_per_field=100):
    """Tabulate, per field, the share of exemplars that landed at each rank.

    Args:
        ranks: DataFrame with 'field' and 'rank' columns, one row per scored
            document.
        n_per_field: Divisor that turns counts into shares. Defaults to 100,
            the number of exemplar papers downloaded per field.

    Returns:
        DataFrame indexed by field whose columns are ('rank', <rank value>)
        pairs. Field/rank combinations that never occurred hold ''.
    """
    counts = ranks.groupby('field')['rank'].value_counts()
    # Pin the series and index-level names explicitly: the names value_counts
    # assigns differ between pandas versions, and 'level_1' is the column
    # name the pivot below expects.
    freqs = (counts / n_per_field).rename('rank').rename_axis(['field', 'level_1'])
    # pivot() arguments are passed by keyword; newer pandas rejects positional use.
    return freqs.reset_index().pivot(index='field', columns='level_1').fillna('')

def print_errors(errors):
    """Print, for each field, the fields that outscored it and how often.

    Args:
        errors: Mapping of field name to a ``Counter`` keyed by the names of
            fields that scored higher. At most the 19 most common entries are
            printed per field.
    """
    for field in errors:
        print(f'{field}:')
        rows = [
            f'    {name:<18} {count}'
            for name, count in errors[field].most_common(19)
        ]
        for row in rows:
            print(row)

def print_texts(texts):
    """Print each misranked document with the fields that outscored it.

    Args:
        texts: Iterable of dicts with 'field', 'higher-scoring fields' and
            'text' keys.
    """
    for entry in texts:
        header = (entry['field'], '<', entry['higher-scoring fields'])
        print(*header)
        # The indented text is followed by one blank separator line.
        print('    ', entry['text'], end='\n\n')

def score_exemplars(mag_texts, embed_function, fields, field_matrix, *embed_args, split=False):
    """Rank each exemplar's MAG field among our level-0 field scores.

    Args:
        mag_texts: DataFrame with 'display_name' (the MAG field) and 'text'
            columns, one row per exemplar document.
        embed_function: Callable invoked as ``embed_function(text, *embed_args)``
            to produce the document representation handed to ``score``.
        fields: Object whose ``index`` attribute supplies the field ids.
        field_matrix: Passed through to ``score`` as the scoring matrix.
        *embed_args: Extra positional arguments forwarded to ``embed_function``.
        split: If true, the text is whitespace-split into tokens before
            embedding; otherwise the raw string is passed.

    Returns:
        A ``(ranks, errors, texts)`` tuple:
        ranks is a DataFrame with 'field' and 'rank' columns (rank 1 means the
        MAG field was our top-scoring field);
        errors maps each field to a ``Counter`` of the fields that outscored it
        (fields that were never outscored have no entry);
        texts is a list of dicts with 'field', 'higher-scoring fields' and
        'text' for every document whose MAG field was not ranked first.

    Raises:
        IndexError: If a document's 'display_name' is missing from the scores
            returned by ``score``.
    """
    ranks = []
    texts = []
    errors = defaultdict(Counter)
    for _, doc in mag_texts.iterrows():
        field = doc['display_name']
        text = doc['text'].split() if split else doc['text']
        doc_vector = embed_function(text, *embed_args)
        scores = score(doc_vector, fields.index, field_matrix)
        # Zero-based position of the MAG field in the best-first score table;
        # computed once and reused for both the rank and the error slice.
        position = np.where(scores['display_name'] == field)[0][0]
        ranks.append((field, position + 1))
        if position != 0:
            # Every field listed before the MAG field outscored it.
            outranking = scores['display_name'].iloc[:position].tolist()
            texts.append({
                'field': field,
                'higher-scoring fields': '; '.join(outranking),
                'text': doc['text']
            })
            errors[field].update(outranking)
    # Passing columns here also keeps an empty input from failing.
    ranks = pd.DataFrame(ranks, columns=['field', 'rank'])
    return ranks, errors, texts



In [48]:
# Show an example for each L0 field
# One row per field: drop_duplicates keeps the first document of each field.
first_per_field = mag_texts.drop_duplicates('display_name')
for name, text in zip(first_per_field['display_name'], first_per_field['text']):
    # Only the first 90 characters are shown to keep the listing compact.
    print(f"{name:<18}", '\t', text[:90])

Art                	 the search for aesthetic meaning in the visual arts the need for the aesthetic tradition i
Biology            	 geographic distribution of the e1 family of genes and their effects on reproductive timing
Business           	 using the financial and business literature electronic resources accounting advertising af
Chemistry          	 the fate of amino acids adsorbed on mineral matter abstract we present here selected resul
Computer science   	 integrating memory consistency models and communication systems the shared memory paradigm
Economics          	 essays in economic theory preface biographical sketch alaknanda patel introduction partha 
Engineering        	 by engineers for engineers the bergeron centre for engineering excellence is more than jus
Environmental science 	 a processbased inventory model for landfill ch4 emissions inclusive of seasonal soil micro
Geography          	 the geography of manitoba its land and its people manitoba is more than one of c

## fasttext

We score the exemplar papers, then see where each paper's MAG field ranks among our L0 scores.

In [None]:
# Rank of each exemplar's MAG field among our level-0 fasttext scores.
ranks = []
for n_done, (_, doc) in enumerate(mag_texts.iterrows(), start=1):
    # embed with fasttext
    doc_vector = embed_fasttext(doc['text'], fields.fasttext)
    # score the vector against field embeddings
    # (uses `fields.index`; the earlier `field.index` referenced the wrong name)
    scores = score(doc_vector, fields.index, fields.field_fasttext)
    # scores are sorted best-first, so position + 1 is the rank of the MAG field
    rank = np.where(scores['display_name'] == doc['display_name'])[0][0] + 1
    ranks.append((doc['display_name'], rank))
    if n_done % 500 == 0:
        print(n_done)  # this takes a little while
ranks = pd.DataFrame(ranks, columns=['field', 'rank'])

In [None]:
# fasttext evaluation: fields.fasttext is forwarded to embed_fasttext as its
# extra argument; documents are scored against fields.field_fasttext.
ranks, errors, texts = score_exemplars(mag_texts, embed_fasttext, fields, fields.field_fasttext, fields.fasttext)

In [20]:
# Share of each field's exemplars whose MAG field landed at each rank (1 = our top field).
print_ranks(ranks)

Unnamed: 0_level_0,rank,rank,rank,rank,rank,rank,rank
level_1,1,2,3,4,5,7,9
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Art,0.54,0.26,0.2,,,,
Biology,0.99,0.01,,,,,
Business,0.98,0.02,,,,,
Chemistry,0.88,0.12,,,,,
Computer science,1.0,,,,,,
Economics,0.84,0.05,0.07,0.02,0.01,0.01,
Engineering,0.82,0.12,0.02,0.01,0.02,,0.01
Environmental science,0.67,0.3,0.03,,,,
Geography,0.86,0.08,0.02,0.03,0.01,,
Geology,1.0,,,,,,


This looks fairly good. For instance, our top field for all of the exemplar CS papers is CS.

Disagreement is highest in physics, environmental science, art, and sociology.

Let's see which fields we're scoring higher than these, when they aren't the top field.

In [52]:
# For the fields with the most disagreement, record which fields outscored the
# MAG field and keep the offending texts for inspection.
errors = {}
texts = []
for field in ['Physics', 'Environmental science', 'Art', 'Sociology']:
    errors[field] = Counter()
    for _, doc in mag_texts.loc[mag_texts.display_name == field].iterrows():
        # embed with fasttext
        doc_vector = embed_fasttext(doc['text'], fields.fasttext)
        # score the vector against field embeddings (level-0 fields, best first)
        scores = score(doc_vector, fields.index, fields.field_fasttext)
        # zero-based position of the MAG field; 0 means we ranked it first
        position = np.where(scores['display_name'] == doc['display_name'])[0][0]
        if position != 0:
            # every field listed before the MAG field outscored it
            outranking = scores['display_name'].iloc[:position].tolist()
            texts.append({
                'field': field,
                'higher-scoring fields': '; '.join(outranking),
                'text': doc['text']
            })
            errors[field].update(outranking)

In [53]:
# For each field: the fields that outscored it, with how many documents each did so for.
print_errors(errors)

Physics:
    Chemistry          52
    Materials science  5
    Biology            4
    Geology            1
Environmental science:
    Geology            32
    Chemistry          2
    Materials science  2
Art:
    Philosophy         42
    History            24
Sociology:
    Political science  18
    History            8
    Psychology         5
    Philosophy         1
    Geography          1
    Art                1


Finally let's take a look at the text.

In [58]:
# Show each misranked document with the fields that outscored it, using the
# shared print_texts helper rather than repeating its loop here.
print_texts(texts)

Physics < Chemistry
     properties of narrowu31 based on themdiquonium 651165116511interpretation we study the properties ofu31 assuming that theu is anmdiquoniumsqbar q2q u ord state it is shown that the annihilation decay which becomes the most important for usual diquonia is forbidden foru we show there exist various reasons which makeu narrow nearu31 we expect other narrow diquonia we also compute the electromagnetic mass splitting and find thatu is the heaviest andu0 is the lightest

Physics < Chemistry
     on electromagnetic corrections in mue decay electromagnetic corrections to the angular distribution of electrons were obtained for the v a theory of mu e decay auth

Physics < Chemistry
     existence of atoms and molecules in nonrelativistic quantum electrodynamics we show that the hamiltonian describing n nonrelativistic electrons with spin interacting with the quantized radiation field and several fixed nuclei with total charge z has a ground state when n z the result hold

Sociology < Political science; History
     dynamic literacies and democracy a framework for historical literacy a stated goal of australian schooling is that all students will become active and informed citizens mceetya melbourne declaration of educational goals for young australians barton act ministerial council on education employment training and youth affairs accordingly national education policy and curriculum reforms are increasingly concerned with the attributes or qualities that may be required for an individual to be a successful citizen in the twentyfirst century research in history education has espoused the potential of studying history to help young people to prepare for the kind of reasoning and informed decision making that will be required for participatory citizenship for examples see sam wineburg why learn history when its already on your phone chicago university of chicago press keith barton agency choice and historical action how history teaching can help students

## tf-idf

In [None]:
# tf-idf evaluation: split=True hands embed_tfidf a whitespace-split token list;
# fields.tfidf and fields.dictionary are forwarded as its extra arguments.
ranks, errors, texts = score_exemplars(mag_texts, embed_tfidf, fields, fields.field_tfidf, fields.tfidf, fields.dictionary, split=True)

In [77]:
# Share of each field's exemplars whose MAG field landed at each rank under tf-idf scoring.
print_ranks(ranks)

Unnamed: 0_level_0,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank
level_1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Art,0.92,0.08,,,,,,,,,,,,,,,
Biology,0.99,0.01,,,,,,,,,,,,,,,
Business,0.96,0.03,0.01,,,,,,,,,,,,,,
Chemistry,0.33,0.65,0.02,,,,,,,,,,,,,,
Computer science,0.55,0.3,0.04,0.06,0.01,0.01,0.02,,,,0.01,,,,,,
Economics,0.9,0.09,0.01,,,,,,,,,,,,,,
Engineering,0.92,0.05,0.01,0.02,,,,,,,,,,,,,
Environmental science,0.35,0.36,0.12,0.06,,0.04,0.03,0.01,,,,0.01,,,0.02,,
Geography,0.48,0.06,0.07,0.02,0.06,0.03,0.04,0.03,0.05,0.02,0.03,0.02,0.03,0.03,0.01,0.02,
Geology,0.97,0.03,,,,,,,,,,,,,,,


## entity

In [91]:
# Entity evaluation: fields.entities is forwarded to embed_entities as its
# extra argument; documents are scored against fields.field_entities.
ranks, errors, texts = score_exemplars(mag_texts, embed_entities, fields, fields.field_entities, fields.entities)

In [92]:
# Share of each field's exemplars whose MAG field landed at each rank under entity scoring.
print_ranks(ranks)

Unnamed: 0_level_0,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank,rank
level_1,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
field,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Art,0.9,0.03,0.01,0.01,0.01,,,0.03,0.01,,,,,,,,,,
Biology,0.25,0.02,,0.01,0.02,,,0.29,0.01,,0.01,0.02,0.01,0.36,,,,,
Business,0.81,0.07,,,0.01,0.07,0.01,0.02,,,,,,,,0.01,,,
Chemistry,0.17,0.05,,0.71,0.01,,,0.01,,,,,,,0.02,0.01,,0.02,
Computer science,0.69,0.21,0.01,,,,0.02,,0.01,,,0.01,,,,0.05,,,
Economics,0.81,0.02,,0.02,0.1,0.01,,,0.04,,,,,,,,,,
Engineering,0.7,0.23,0.01,,0.02,,,0.01,,,0.02,0.01,,,,,,,
Environmental science,0.1,0.02,0.05,0.08,0.07,0.13,0.04,0.07,0.04,0.03,0.04,0.01,,0.02,0.04,,0.26,,
Geography,0.4,0.07,0.09,0.01,0.04,0.02,0.15,0.04,0.01,,,,,,,,,,0.17
Geology,0.18,0.05,0.05,,0.02,,,0.04,0.02,0.4,0.04,0.07,,0.01,,0.06,0.06,,


In [95]:
# For each field: the fields that outscored it under entity scoring, with counts.
print_errors(errors)

Art:
    Sociology          7
    History            6
    Psychology         5
    Engineering        4
    Economics          4
    Materials science  3
    Chemistry          3
    Business           3
    Political science  2
    Computer science   1
    Biology            1
    Geography          1
    Philosophy         1
Biology:
    Psychology         70
    Sociology          70
    Political science  70
    History            70
    Art                68
    Philosophy         67
    Economics          43
    Engineering        40
    Business           40
    Geography          38
    Physics            37
    Materials science  36
    Chemistry          36
    Geology            36
    Computer science   6
    Medicine           3
    Environmental science 3
    Mathematics        3
Business:
    Economics          17
    Psychology         10
    Engineering        7
    History            6
    Sociology          6
    Political science  6
    Art                6
    Mat

In [96]:
# Show the documents whose MAG field was not ranked first, with the fields that outscored them.
print_texts(texts)

Art < Psychology; Engineering; Materials science; Chemistry; Economics; Business; Sociology
     alex livingston vistas pope presents livingstons paintings as being concerned with modernism beauty and reality biographical notes bibl ref

Art < History
     19th century art part painting changes in history painting crossing the atlantic angloamerican connections and the wooing of john singleton copley france jacqueslouis david challenging apollo david and the martyrdom of jeanpaul marat francisco de goya y lucientes goya and the imaging of royalty in spain the rise of romanticism in england the neoclassicromantic dilemma painting in france after david the primitifs an early artistic brotherhood in the nineteenth century the image of the ruler varieties of landscape painting the nazarenes the nazarenes german romantics in rome romantic meditations in germany and france sculpture introduction england scandinavia france a pedestrian statue houdon jefferson and washington antonio canova the

     tracking the chemical footprint of surfacerunoff infiltration on groundwater recharge in an arid region this research as part of the nye county nuclear waste repository project office nwrpo attempts to provide new insight into the chemical evolution of southern nevadas groundwater its potential flow paths infiltration rates and surfacerunoff processes through initiating a surfacerunoff sampling network the sampling network tracks the chemical footprint of the surfacerunoff water and groundwater recharging infiltration chemistry by collecting baseline data through a long term study on a comprehensive suite of chemical parameters these parameters include major ion chemistry nutrients trace elements and stable isotope ratios multiple analytical methods are employed to analyze this data to develop a defensible groundwater chemistry monitoring network downgradient of yucca mountain suitable for longterm performance confirmation monitoring this study includes precipitation water chemist


Philosophy < Psychology; Engineering; Materials science; Chemistry; Economics; Business; Sociology; Art
     hamanns metakritiek en de bronnen van de angelsaksische cultuurfilosofie hamanns view of language is at once the most central and the most original doctrine in the rich and disordered world of his ideas and perhaps the most fertile from the seed that he planted ... developed herders linguistic historicism and psychologism and nothing would have horrified hamann more deeply a powerful factor in modern linguistic analysis i berlin the magus of the north jg hamann and the origins of modern irrationalism john muray london p the proponents and developers of the romantic theory have been among the most passionate critics of the epistemologicai tradition from hamanns review of kants critique of pure reason to the writings in our century of heidegger of the later wittgenstein and of certain postmodernists charles taylor philosophical arguments cambridge harvard university press p ix

P