In [4]:
import pandas as pd

from fos.model import FieldModel
from fos.settings import ASSETS_DIR
from fos.util import preprocess

# Field metadata table. Its index is cast to int so that rows can be looked up
# with integer ids via `.loc`.
# NOTE(review): read_pickle can execute arbitrary code while loading; only point
# this at asset files you trust.
meta = pd.read_pickle(ASSETS_DIR / "fields/fos.pkl.gz")
meta.index = meta.index.astype(int)

# Mapping from metadata index value to display name, restricted to level-0 rows.
id_to_name = meta.query("level == 0")["display_name"].to_dict()

# Field model constructed for 'en' (presumably the language code -- confirm
# against FieldModel's signature).
field_model = FieldModel('en')

# One JSON record per line.
venues = pd.read_json('ai_venue_text.jsonl', lines=True)

In [7]:
# Draw a reproducible random sample of at most SAMPLE_SIZE rows from every
# `scholar_cat` group. Groups smaller than SAMPLE_SIZE are kept whole (in
# shuffled order), which is why the sample size is capped with min().
SAMPLE_SIZE = 1_000
SAMPLE_SEED = 20220323

venue_samples = {
    scholar_cat: group.sample(n=min(len(group), SAMPLE_SIZE), random_state=SAMPLE_SEED)
    for scholar_cat, group in venues.groupby('scholar_cat')
}

In [8]:
# Number of sampled rows per category, in dict insertion order.
[len(sample) for sample in venue_samples.values()]

[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]

In [44]:
def score(text,
          field_model: FieldModel,
          field_meta: pd.DataFrame,
          preprocess_text=False,
          lang='en'):
    """Score ``text`` against every field in ``field_model``, keyed by field name.

    The text is optionally run through ``preprocess(text, lang)``, embedded with
    ``field_model.embed``, scored with ``field_model.score``, and the result's
    ``average()`` is paired positionally with ``field_model.index``.

    Parameters
    ----------
    text
        Input passed to ``field_model.embed`` (after optional preprocessing).
    field_model : FieldModel
        Object providing ``embed``, ``score`` and ``index``.
    field_meta : pd.DataFrame
        Table with a ``display_name`` column, looked up by ``int(field id)``.
    preprocess_text : bool
        When true, apply ``preprocess(text, lang)`` before embedding.
    lang : str
        Language argument forwarded to ``preprocess``.

    Returns
    -------
    dict
        ``{display_name: averaged score}`` with one entry per model index item.

    Raises
    ------
    AssertionError
        If the averaged scores and the model index differ in length.
    """
    cleaned = preprocess(text, lang) if preprocess_text else text
    embedding = field_model.embed(cleaned)
    mean_scores = field_model.score(embedding).average()
    # Guard the positional pairing below: zip() would silently truncate otherwise.
    assert len(field_model.index) == len(mean_scores)
    named_scores = {}
    for field_id, value in zip(field_model.index, mean_scores):
        named_scores[field_meta.loc[int(field_id), 'display_name']] = value
    return named_scores

def ordered_names(fields):
    """Return a copy of ``fields`` whose entries are ordered by descending value.

    Entries with equal values keep their original relative order.
    """
    ranked_keys = sorted(fields, key=fields.get, reverse=True)
    return {name: fields[name] for name in ranked_keys}

def _keep_level(fields, level, field_meta=None):
    """Keep the entries of ``fields`` whose key is a display name at ``level``.

    ``field_meta`` must have ``level`` and ``display_name`` columns; when it is
    omitted the notebook-level ``meta`` table is used. Insertion order of
    ``fields`` is preserved.
    """
    source = meta if field_meta is None else field_meta
    # A set gives constant-time membership checks instead of scanning an array
    # once per key.
    level_names = set(source.loc[source['level'] == level, 'display_name'])
    return {name: value for name, value in fields.items() if name in level_names}


def keep_l0(fields, field_meta=None):
    """Return only the entries of ``fields`` named after a level-0 field.

    Parameters
    ----------
    fields : dict
        Mapping of field display name to score.
    field_meta : pd.DataFrame, optional
        Metadata table to read level/name pairs from; defaults to ``meta``.
    """
    return _keep_level(fields, 0, field_meta)


def keep_l1(fields, field_meta=None):
    """Return only the entries of ``fields`` named after a level-1 field.

    Parameters
    ----------
    fields : dict
        Mapping of field display name to score.
    field_meta : pd.DataFrame, optional
        Metadata table to read level/name pairs from; defaults to ``meta``.
    """
    return _keep_level(fields, 1, field_meta)

In [60]:
from tqdm import tqdm

# Score the `text` of every sampled row and store the resulting dict in a new
# `fields` column. The sample frames are modified in place.
for cat, df in tqdm(venue_samples.items()):
    df['fields'] = df['text'].apply(
        lambda text: score(text, field_model=field_model, field_meta=meta)
    )

100%|██████████| 10/10 [03:43<00:00, 22.30s/it]


In [61]:
def _names_by_score(fields):
    """Return the keys of ``fields`` ordered by descending value."""
    return [name for name, _ in sorted(fields.items(), key=lambda item: item[1], reverse=True)]


def process_scores(df, top_k=3):
    """Add ranked field-name columns derived from the ``fields`` column.

    Columns added, in this order:

    * ``field_names`` -- ``fields`` passed through ``ordered_names``;
    * ``l0`` / ``l1`` -- names kept by ``keep_l0`` / ``keep_l1``, ranked by
      descending score;
    * ``l{level}_k{rank}`` for ``rank`` in ``1..top_k`` -- the name at that rank,
      or ``None`` when a row has fewer names at that level.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``fields`` column of ``{name: score}`` dicts.
    top_k : int
        How many top-ranked names per level to split into their own columns.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object; it is modified in place, so other references to
        it see the new columns too.
    """
    level_filters = ((0, keep_l0), (1, keep_l1))
    df['field_names'] = df['fields'].apply(ordered_names)
    # Build both ranked-list columns before the per-rank columns so the column
    # order stays l0, l1, l0_k1..., l1_k1...
    for level, keep_level in level_filters:
        df[f'l{level}'] = df['field_names'].apply(keep_level).apply(_names_by_score)
    for level, _ in level_filters:
        for rank in range(top_k):
            df[f'l{level}_k{rank + 1}'] = df[f'l{level}'].apply(
                lambda names, rank=rank: names[rank] if rank < len(names) else None
            )
    return df

# Add the ranked field-name columns to every category sample.
# NOTE: the loop variables `cat` and `df` stay bound to the last category after
# this cell finishes, and the following display cell reads them.
for cat, df in venue_samples.items():
    venue_samples[cat] = process_scores(df)

In [65]:
# Count of rows per top-ranked level-1 field for a single category, largest
# first. `cat` and `df` are the loop variables left over from the preceding
# loop over `venue_samples`, so this shows whichever category was iterated last.
cat, df.groupby(['l1_k1'])['id'].agg('count').sort_values(ascending=False)

('med_virology',
 l1_k1
 Virology               488
 Immunology             103
 Cell biology            84
 Molecular biology       36
 Genetics                24
                       ... 
 Internet privacy         1
 Literature               1
 Medical emergency        1
 Medicinal chemistry      1
 Gynecology               1
 Name: id, Length: 87, dtype: int64)

In [69]:
# Stack the per-category samples into one frame with a fresh 0..n-1 index.
# NOTE(review): the saved output below includes a `named_fields` column that no
# visible cell creates -- presumably left over from a removed cell; confirm the
# notebook still reproduces under Restart & Run All.
cats = pd.concat(venue_samples, ignore_index=True)
cats.head()

Unnamed: 0,id,scholar_cat,journal_name,year,text,fields,named_fields,field_names,l0,l1,l0_k1,l0_k2,l0_k3,l1_k1,l1_k2,l1_k3
0,2744039113,bio_agronomycropscience,Industrial Crops and Products,2016,powered by nict,{'Industrial organization': 0.1760948600414657...,"[Gynecology, Hydrology, Systems engineering, E...","{'Embedded system': 0.20845264613976014, 'Elec...","[Computer science, Engineering, Physics, Mater...","[Embedded system, Electrical engineering, Aero...",Computer science,Engineering,Physics,Embedded system,Electrical engineering,Aeronautics
1,2991192488,bio_agronomycropscience,Computers and Electronics in Agriculture,2020,hybrid extreme learning machine with metaheuri...,{'Industrial organization': 0.6385366394218659...,"[Gynecology, Hydrology, Systems engineering, E...","{'Machine learning': 0.7251366770152661, 'Algo...","[Computer science, Mathematics, Physics, Geolo...","[Machine learning, Algorithm, Pattern recognit...",Computer science,Mathematics,Physics,Machine learning,Algorithm,Pattern recognition
2,2520437996,bio_agronomycropscience,Field Crops Research,2016,n and s concentration and stoichiometry in soy...,{'Industrial organization': 0.3171993576008785...,"[Gynecology, Hydrology, Systems engineering, E...","{'Biology': 0.3488822570079546, 'Geology': 0.3...","[Biology, Geology, Chemistry, Medicine, Geogra...","[Botany, Economic growth, Geochemistry, Biotec...",Biology,Geology,Chemistry,Botany,Economic growth,Geochemistry
3,3130483981,bio_agronomycropscience,Theoretical and Applied Genetics,2021,genotyping crossing parents and family bulks c...,{'Industrial organization': 0.6401885717449108...,"[Gynecology, Hydrology, Systems engineering, E...","{'Bioinformatics': 0.696970277990277, 'Biology...","[Biology, Computer science, Engineering, Medic...","[Bioinformatics, Simulation, Genetics, Genealo...",Biology,Computer science,Engineering,Bioinformatics,Simulation,Genetics
4,2343019514,bio_agronomycropscience,Scientia Horticulturae,2016,effect of highly processed calcined kaolin res...,{'Industrial organization': 0.6312849544750182...,"[Gynecology, Hydrology, Systems engineering, E...","{'Biology': 0.6809966859453974, 'Botany': 0.67...","[Biology, Environmental science, Geology, Chem...","[Botany, Water resource management, Environmen...",Biology,Environmental science,Geology,Botany,Water resource management,Environmental engineering


In [106]:
def summarize(cats, col='l0_k1'):
    """Count rows per (``scholar_cat``, ``col``) pair, largest counts first.

    Parameters
    ----------
    cats : pd.DataFrame
        Frame with ``scholar_cat``, ``id`` and ``col`` columns. Not modified.
    col : str
        Column whose values are counted within each ``scholar_cat``.

    Returns
    -------
    pd.DataFrame
        Columns ``scholar_cat``, ``col`` and ``id`` (the non-null ``id`` count),
        ordered by ``scholar_cat`` ascending then count descending, with a fresh
        0..n-1 index. Rows with equal counts keep the groupby key order.
    """
    counts = cats.groupby(['scholar_cat', col], as_index=False)[['id']].count()
    # One multi-column sort replaces groupby().apply(sort_values): applying over
    # the grouping columns is deprecated in recent pandas and would drop
    # `scholar_cat` from the result.
    counts = counts.sort_values(['scholar_cat', 'id'], ascending=[True, False])
    return counts.reset_index(drop=True)

# Per-category counts of the top-ranked level-0 field (`l0_k1`).
l0_counts = summarize(cats, 'l0_k1')
l0_counts

Unnamed: 0,scholar_cat,l0_k1,id
0,bio_agronomycropscience,Biology,586
1,bio_agronomycropscience,Environmental science,157
2,bio_agronomycropscience,Chemistry,82
3,bio_agronomycropscience,Geology,75
4,bio_agronomycropscience,Computer science,38
...,...,...,...
162,med_virology,Mathematics,1
163,med_virology,Materials science,1
164,med_virology,Philosophy,1
165,med_virology,Engineering,1


In [105]:
# `scholar_cat` is already the first column, so writing without the positional
# index yields the same file as promoting it to the index first.
l0_counts.to_csv('venue_l0_counts.csv', index=False)

In [107]:
# Per-category counts of the top-ranked level-1 field (`l1_k1`).
l1_counts = summarize(cats, 'l1_k1')
l1_counts

Unnamed: 0,scholar_cat,l1_k1,id
0,bio_agronomycropscience,Botany,223
1,bio_agronomycropscience,Soil science,118
2,bio_agronomycropscience,Genetics,84
3,bio_agronomycropscience,Agronomy,52
4,bio_agronomycropscience,Water resource management,46
...,...,...,...
1042,med_virology,Internet privacy,1
1043,med_virology,Literature,1
1044,med_virology,Medical emergency,1
1045,med_virology,Medicinal chemistry,1


In [108]:
# `scholar_cat` is already the first column, so writing without the positional
# index yields the same file as promoting it to the index first.
l1_counts.to_csv('venue_l1_counts.csv', index=False)

In [137]:
def summarize_journals(cats, col='l1_k1'):
    """Count rows per (``scholar_cat``, ``journal_name``, ``col``) triple.

    Parameters
    ----------
    cats : pd.DataFrame
        Frame with ``scholar_cat``, ``journal_name``, ``id`` and ``col``
        columns. Not modified.
    col : str
        Column whose values are counted within each journal.

    Returns
    -------
    pd.DataFrame
        Columns ``scholar_cat``, ``journal_name``, ``col`` and ``id`` (the
        non-null ``id`` count), ordered by category and journal ascending then
        count descending, with a fresh 0..n-1 index. Rows with equal counts keep
        the groupby key order.
    """
    journal_keys = ['scholar_cat', 'journal_name']
    counts = cats.groupby(journal_keys + [col], as_index=False)[['id']].count()
    # One multi-column sort replaces groupby().apply(sort_values): applying over
    # the grouping columns is deprecated in recent pandas and would drop the key
    # columns from the result.
    counts = counts.sort_values(journal_keys + ['id'], ascending=[True, True, False])
    return counts.reset_index(drop=True)

# Journals need at least this many sampled rows to be reported, and the
# truncated export keeps this many top fields per journal.
MIN_JOURNAL_PAPERS = 100
TOP_FIELDS_PER_JOURNAL = 5

j_counts = summarize_journals(cats)

# transform('sum') returns the per-journal total aligned to j_counts' own index,
# so it can be assigned directly. groupby().apply() prepends the group keys to
# the index in newer pandas, which breaks that alignment.
journal_totals = j_counts.groupby(['scholar_cat', 'journal_name'])['id'].transform('sum')
j_counts['prop'] = j_counts['id'] / journal_totals
j_counts['total'] = journal_totals
j_counts = j_counts.loc[j_counts['total'] >= MIN_JOURNAL_PAPERS]
j_counts = j_counts.rename(columns={'id': 'n'})
j_counts.to_csv('journal_l1_counts.csv', float_format='%.2f')

# Keep the first TOP_FIELDS_PER_JOURNAL rows per journal; rows are already
# ordered by descending count within each journal.
heads = {}
for idx, df in j_counts.groupby(['scholar_cat', 'journal_name']):
    heads[idx] = df[['l1_k1', 'prop', 'total']].head(TOP_FIELDS_PER_JOURNAL)
# concat adds the (scholar_cat, journal_name) keys as outer index levels; the
# innermost level is the old row index and is dropped.
j_heads = pd.concat(heads).reset_index(level=2, drop=True)
j_heads.to_csv('journal_l1_counts_truncated.csv', float_format='%.2f')

In [136]:
# Display the per-journal field counts for journals that passed the `total` filter.
j_counts

Unnamed: 0,scholar_cat,journal_name,l1_k1,n,prop,total
154,bio_agronomycropscience,Industrial Crops and Products,Botany,34,0.225166,151
155,bio_agronomycropscience,Industrial Crops and Products,Composite material,16,0.105960,151
156,bio_agronomycropscience,Industrial Crops and Products,Biochemistry,15,0.099338,151
157,bio_agronomycropscience,Industrial Crops and Products,Nuclear chemistry,14,0.092715,151
158,bio_agronomycropscience,Industrial Crops and Products,Organic chemistry,7,0.046358,151
...,...,...,...,...,...,...
2748,med_virology,Vaccine,Environmental planning,1,0.005682,176
2749,med_virology,Vaccine,Environmental resource management,1,0.005682,176
2750,med_virology,Vaccine,Gender studies,1,0.005682,176
2751,med_virology,Vaccine,Knowledge management,1,0.005682,176
