# Imports

In [None]:
import polars as pl
import numpy as np
from json import dump

# Read data

In [2]:
# Parquet sample holding one row per paper.
# NOTE(review): the path has no leading "/", so it is resolved relative to the
# kernel's working directory — confirm this is intended.
sample_path = "home/ubuntu/sample"
df = pl.read_parquet(sample_path)

In [3]:
# Keep only the three columns of interest, in this order; the name `df` is rebound.
df = df.select(['authors', 'topics', 'high_concepts_flat'])

# Read preprocessed concept embeddings

In [4]:
# Load the precomputed concept embedding matrix and display its dimensions.
concepts_path = 'concepts.npy'
concepts = np.load(concepts_path)
concepts.shape

(1031781, 312)

# Node embeddings extraction

In [5]:
# Lookup tables filled by the aggregation loop in the next cell:
#   authors   -> per-author record, keyed by the author's source id
#   topics    -> topic field  -> integer index
#   countries -> country      -> integer index
authors, topics, countries = {}, {}, {}

In [6]:
# Walk the papers and their concept embeddings in lockstep and accumulate, for
# every author: the embeddings of the papers they appear on, the indices of the
# countries of their institutions, and the indices of the papers' topic fields.
# `authors`, `topics` and `countries` are mutated in place, so re-running this
# cell without re-running the initialisation cell appends everything again.

# zip() stops at the shorter input; without this guard a row-count mismatch
# would silently drop papers instead of failing.
if len(df) != len(concepts):
    raise ValueError(
        f"df has {len(df)} rows but concepts has {len(concepts)} embeddings"
    )

# Select the two columns that are actually read, by name, so unused columns are
# not converted to Python objects row by row and column order does not matter.
paper_rows = df.select(['authors', 'topics']).iter_rows()
for (paper_authors, paper_topics), concept in zip(paper_rows, concepts):
    for author in paper_authors:
        record = authors.get(author['id'])
        if record is None:
            # First sighting: the author's integer index is the current table size.
            record = {'id': len(authors), 'topics': set(), 'concepts': [], 'countries': set()}
            authors[author['id']] = record
        record['concepts'].append(concept)
        for institution in author['institutions']:
            country = institution['country']
            if country not in countries:
                countries[country] = len(countries)
            record['countries'].add(countries[country])
        if paper_topics is not None:
            for topic in paper_topics:
                field = topic['field']
                if field not in topics:
                    topics[field] = len(topics)
                record['topics'].add(topics[field])
# Spot check on one hard-coded author id: number of embeddings collected for it.
# Raises KeyError if that author is not present in the loaded sample.
len(authors['/A5056295479']['concepts'])

3

In [7]:
# Build one feature record per author, keyed by the author's integer index.
# Each record holds the mean concept embedding plus multi-hot vectors over all
# known topics and countries.
topic_count = len(topics)
country_count = len(countries)

preproc = {}
for record in authors.values():
    # Element-wise mean of every concept embedding collected for this author.
    concept_mean = np.array(record['concepts']).mean(axis=0)

    # 1 at each topic index linked to the author, 0 elsewhere.
    topic_flags = np.zeros((topic_count,))
    topic_flags[list(record['topics'])] = 1

    # 1 at each country index linked to the author, 0 elsewhere.
    country_flags = np.zeros((country_count,))
    country_flags[list(record['countries'])] = 1

    preproc[record['id']] = {
        'concepts': concept_mean,
        'topics': topic_flags,
        'countries': country_flags,
    }

preproc[0]

{'concepts': array([-2.59943515e-01,  1.44795969e-01, -1.54523375e-02,  1.89175561e-01,
         2.54683822e-01, -2.13728502e-01,  4.02246378e-02,  1.93462387e-01,
         2.23417535e-01,  5.95416538e-02,  1.16616763e-01,  7.62125775e-02,
         3.31325859e-01,  2.91074008e-01,  4.55036275e-02,  3.45543474e-01,
        -1.52431369e-01, -2.22207621e-01, -1.95779249e-01,  2.26351902e-01,
         1.42640159e-01,  5.64255603e-02,  1.35495439e-01, -1.00458600e-01,
         4.27240320e-02, -1.15140535e-01, -1.48380518e-01,  2.78179079e-01,
         1.47552667e-02,  6.68614432e-02, -3.15055735e-02,  6.66925088e-02,
         5.85699677e-01,  3.34749013e-01, -9.04444698e-03, -2.33284876e-01,
         3.60989906e-02,  2.10040927e-01,  6.02730364e-02,  5.43373264e-02,
         3.80111188e-01,  4.14391495e-02, -6.82968572e-02,  2.46995106e-01,
        -4.09442902e-01, -1.80831060e-01, -2.43959442e-01, -5.35451137e-02,
        -1.78514823e-01, -1.99130833e-01, -2.91202012e-02,  3.36082950e-02,


In [8]:
# One row per author, in `preproc` iteration order: concept mean, then topic
# flags, then country flags, concatenated into a single vector.
final = np.array([
    np.concatenate([features[key] for key in ('concepts', 'topics', 'countries')])
    for features in preproc.values()
])
print(final.shape)

(3244445, 559)


# Save final encodings

In [13]:
# Persist the author feature matrix; np.save appends ".npy" when the file name
# does not already end with it.
np.save(file='author_embeddings', arr=final)

# Save ids for countries and topics for future tests

In [None]:
# Write both name -> index lookup tables as JSON so the column layout of the
# saved embeddings can be decoded later.
for filename, mapping in (('countries.json', countries), ('topics.json', topics)):
    with open(filename, 'w') as handle:
        dump(mapping, handle)