Agreement between crosswalked CNKI CLC codes and the fields predicted by the field model.

In [1]:
%load_ext google.cloud.bigquery
%matplotlib inline

In [2]:
import pandas as pd
import pandas_gbq
import seaborn as sns
from matplotlib import pyplot as plt
import os
from google.cloud.bigquery import magics
from fos.util import preprocess, preprocess_text

# Point both BigQuery entry points (the %%bigquery cell magic and pandas_gbq)
# at the same GCP project.
magics.context.project = 'gcp-cset-projects'
pandas_gbq.context.project = 'gcp-cset-projects'
# Have pandas_gbq use the standard SQL dialect.
pandas_gbq.context.dialect = 'standard'

# Apply seaborn's "whitegrid" theme to any plots.
sns.set_theme(style="whitegrid")

In [3]:
from fos.model import FieldModel

# Build the field model used below to embed and score each paper's text.
# "zh" matches the lang passed to preprocess_text below — presumably the
# Chinese-language model; confirm against fos.model.
field_model = FieldModel("zh")

In [4]:
%%bigquery sample
-- Pull roughly 1% of the table into the `sample` DataFrame.
-- NOTE(review): TABLESAMPLE SYSTEM is not seeded, so the rows returned can
-- differ between runs; the fixed random_state used when subsampling below
-- does not make the analysis reproducible on its own.
select *
from field_model_replication.cnki_papers_with_mag_fields
tablesample system (1 percent)

Query complete after 0.02s: 100%|██████████| 2/2 [00:00<00:00, 237.52query/s]                         
Downloading: 100%|██████████| 535737/535737 [00:07<00:00, 70915.24rows/s] 


In [41]:
# Draw a fixed-size, seeded subset of the query result and key it by merged_id.
sample = (
    sample
    .sample(n=10_000, random_state=20220323)
    .set_index('merged_id')
)

In [42]:
# Strip the '_foreign' marker from the column names, then build the 'text'
# column by running the project's preprocess_text helper over every row.
sample = sample.rename(columns=lambda name: name.replace('_foreign', ''))
sample = sample.assign(text=sample.apply(preprocess_text, axis=1, lang="zh"))

In [43]:
# Sanity check on the sample's dimensions after subsampling and preprocessing.
sample.shape

(10000, 4)

In [44]:
from tqdm import tqdm

# Embed each paper's preprocessed text, keyed by merged_id.
# Iterating the 'text' column directly avoids building a full row Series per
# paper (which iterrows does even though only 'text' is read), and passing
# total= lets tqdm draw a real progress bar with an ETA instead of a bare
# iteration counter.
vectors = {
    merged_id: field_model.embed(text)
    for merged_id, text in tqdm(sample['text'].items(), total=len(sample))
}

10000it [00:39, 255.49it/s]


In [45]:
# Score every embedded paper with the field model; keys stay merged_id.
scores = {
    merged_id: field_model.score(vector)
    for merged_id, vector in tqdm(vectors.items())
}

100%|██████████| 10000/10000 [03:16<00:00, 50.77it/s]


In [46]:
# For each paper, pair the values returned by average() with the field ids in
# field_model.index, giving a {field_id: score} mapping per merged_id.
avg_scores = {
    merged_id: dict(zip(field_model.index, paper_scores.average()))
    for merged_id, paper_scores in tqdm(scores.items())
}

100%|██████████| 10000/10000 [00:00<00:00, 17105.25it/s]


In [47]:
# Load the field metadata table; later cells read its 'display_name', 'level'
# and 'normalized_name' columns.
# NOTE: read_pickle can execute arbitrary code from the file, so only load
# trusted assets.
meta = pd.read_pickle("../../assets/fields/fos.pkl.gz")

In [48]:
# Re-key each paper's averaged scores from field id to the field's display
# name, looked up in the metadata table.
named_scores = {
    merged_id: {
        meta.loc[field_id, 'display_name']: score
        for field_id, score in field_scores.items()
    }
    for merged_id, field_scores in avg_scores.items()
}

In [53]:
def ordered_names(fields):
    """Return a copy of ``fields`` ordered from highest to lowest score.

    ``fields`` maps a field name to its score. Entries with equal scores keep
    their original relative order.
    """
    ranked = sorted(fields.items(), key=lambda pair: pair[1], reverse=True)
    return {name: score for name, score in ranked}

def _keep_level(fields, level):
    """Return the entries of ``fields`` whose key is a display name at ``level``.

    ``fields`` maps display names to scores; the names belonging to ``level``
    are read from the module-level ``meta`` table on every call.
    """
    # A set gives constant-time membership tests; `in` on the raw NumPy array
    # re-scans every name for every key.
    level_names = set(meta.loc[meta['level'] == level, 'display_name'])
    return {name: score for name, score in fields.items() if name in level_names}

def keep_l0(fields):
    """Keep only the level-0 fields of a ``{display_name: score}`` mapping."""
    return _keep_level(fields, 0)

def keep_l1(fields):
    """Keep only the level-1 fields of a ``{display_name: score}`` mapping."""
    return _keep_level(fields, 1)


In [54]:
# Split each paper's named scores into a level-0-only and a level-1-only view.
l0_scores = {
    merged_id: keep_l0(field_scores)
    for merged_id, field_scores in named_scores.items()
}
l1_scores = {
    merged_id: keep_l1(field_scores)
    for merged_id, field_scores in named_scores.items()
}

In [55]:
# Re-order every paper's fields from highest to lowest score, so that the
# first key of each inner dict is that paper's top-scoring field.
l0_scores = {
    merged_id: ordered_names(level_fields)
    for merged_id, level_fields in l0_scores.items()
}
l1_scores = {
    merged_id: ordered_names(level_fields)
    for merged_id, level_fields in l1_scores.items()
}

In [60]:
# The inner dicts were sorted by descending score in the previous cell, so the
# first key is each paper's top-scoring level-0 field.
top_l0 = {
    merged_id: next(iter(ranked_fields))
    for merged_id, ranked_fields in l0_scores.items()
}

In [68]:
# The inner dicts are sorted by descending score, so the first key is each
# paper's top-scoring level-1 field.
top_l1 = {
    merged_id: next(iter(ranked_fields))
    for merged_id, ranked_fields in l1_scores.items()
}

In [69]:
# Attach field metadata to each sampled paper's NormalizedName label by
# matching it against meta's normalized_name; the inner join keeps only
# papers whose label has a match.
sample_labels = (
    sample[['NormalizedName']]
    .reset_index()
    .merge(
        meta[['normalized_name', 'display_name', 'level']],
        how='inner',
        left_on='NormalizedName',
        right_on='normalized_name',
    )
)

In [70]:
# One-column frame of top level-1 predictions, indexed by merged_id.
top_l1_df = pd.DataFrame.from_dict(top_l1, orient='index', columns=['l1_pred'])
top_l1_df.head()

Unnamed: 0,0
carticle_0217688961,Library science
carticle_0200563027,World Wide Web
carticle_0064904030,Speech recognition
carticle_0004159012,Speech recognition
carticle_0224295938,Computer security


In [72]:
# Join the label table to the level-1 predictions on merged_id
# (a column on the left, the index on the right).
df = sample_labels.merge(top_l1_df, left_on='merged_id', right_index=True)

In [80]:
# Display names of 34 fields, kept as an explicit list in their original order.
cs_l1 = [
    'Distributed computing',
    'Software engineering',
    'Knowledge management',
    'Human–computer interaction',
    'Data mining',
    'Simulation',
    'Telecommunications',
    'Natural language processing',
    'Computer graphics (images)',
    'Internet privacy',
    'Computer hardware',
    'Artificial intelligence',
    'Database',
    'Library science',
    'Theoretical computer science',
    'Computer engineering',
    'Parallel computing',
    'Computer network',
    'Embedded system',
    'Speech recognition',
    'Data science',
    'Multimedia',
    'Information retrieval',
    'Programming language',
    'Real-time computing',
    'World Wide Web',
    'Computational science',
    'Pattern recognition',
    'Computer vision',
    'Computer architecture',
    'Machine learning',
    'Computer security',
    'Algorithm',
    'Operating system',
]

In [77]:
# NOTE(review): disabled checkpoint for the joined labels/predictions frame.
# Uncomment the first line to reload a previously saved copy, or the second to
# save the current one; neither runs as written.
# df = pd.read_csv("cnki_clc_vs_pred.csv")
# df.to_csv("cnki_clc_vs_pred.csv")


In [81]:
# Preview the joined labels and level-1 predictions.
df.head()

Unnamed: 0,merged_id,NormalizedName,normalized_name,display_name,level,l1_pred
0,carticle_0217688961,library science,library science,Library science,1,Library science
1,carticle_0237851942,library science,library science,Library science,1,Archaeology
2,carticle_0239956054,library science,library science,Library science,1,Library science
3,carticle_0224375811,library science,library science,Library science,1,Algorithm
4,carticle_0238978381,library science,library science,Library science,1,Operating system


In [85]:
# Number of papers under each label (display_name).
df.groupby('display_name')['merged_id'].count()

display_name
Artificial intelligence    2170
Computer network           4260
Information retrieval        72
Library science            1080
Operating system            478
Telecommunications         1940
Name: merged_id, dtype: int64

In [86]:
# Number of papers per predicted level-1 field, most frequent first.
df.groupby('l1_pred')['merged_id'].count().sort_values(ascending=False)

l1_pred
Algorithm                     1838
Accounting                     822
Library science                619
World Wide Web                 544
Speech recognition             468
                              ... 
Physical therapy                 1
Linguistics                      1
Computer architecture            1
Public economics                 1
Human–computer interaction       1
Name: merged_id, Length: 171, dtype: int64

In [92]:
from sklearn.metrics import classification_report

# Leave two label classes out of the evaluation.
# NOTE(review): the reason for excluding them is not recorded here — confirm.
df = df[~df['display_name'].isin(['Telecommunications', 'Information retrieval'])]

# Report only on the classes that remain in the labels; predictions of any
# other field still count as misses for the true class.
labels = df['display_name'].unique()
print(classification_report(y_true=df['display_name'], y_pred=df['l1_pred'], labels=labels))

                         precision    recall  f1-score   support

        Library science       0.95      0.52      0.67      1080
       Computer network       0.98      0.09      0.16      4260
Artificial intelligence       0.89      0.10      0.19      2170
       Operating system       0.68      0.52      0.59       478

              micro avg       0.88      0.18      0.29      7988
              macro avg       0.87      0.31      0.40      7988
           weighted avg       0.93      0.18      0.26      7988



In [94]:
# Count papers for every (label, predicted field) pair, sort by label and then
# by count (both descending), and save the table as CSV.
counts = (
    df.groupby(['display_name', 'l1_pred'], as_index=False)[['merged_id']]
    .agg('count')
    .sort_values(['display_name', 'merged_id'], ascending=False)
)
counts.to_csv("cnki_mag_pred_counts.csv", index=False)