In [1]:
from os import environ
from pyalex import config, Topics, Works

In [2]:
# Read the contact address from the environment and hand it to pyalex's config;
# environ.get yields None when OPENALEX_CONTACT_EMAIL is not set.
# NOTE(review): the last line echoes the address into the saved cell output —
# clear that output before sharing the notebook.
contact_email = environ.get("OPENALEX_CONTACT_EMAIL")
config.email = contact_email
config.email

'seuring@campus.tu-berlin.de'

In [3]:
# Limit to the field of Computer Science (for now).
# With return_meta=True the call returns an indexable result whose element 1
# is the metadata dict (total count and paging info) displayed below.
cs_topics_response = Topics().filter(field={'id': 17}).get(return_meta=True)
cs_topics_meta = cs_topics_response[1]
cs_topics_meta

{'count': 302,
 'db_response_time_ms': 30,
 'page': 1,
 'per_page': 25,
 'groups_count': None}

In [4]:
from itertools import chain

# Download every Computer Science topic. The field filter (id 17) is the same
# one used by the count query earlier in the notebook; without it the loop
# would page through the topics of all fields.
# paginate() yields one list of topic records per page, so the pages are
# flattened into a single list.
cs_topic_pages = Topics().filter(field={'id': 17}).paginate(per_page=200)
topics = list(chain.from_iterable(cs_topic_pages))

In [5]:
# Peek at the first record to see which fields a topic carries
topics[0]

{'id': 'https://openalex.org/T13602',
 'display_name': 'Enhancing Digital Literacy through Education',
 'description': 'This cluster of papers focuses on enhancing digital literacy and education, with a particular emphasis on e-learning, school literacy movement, information literacy, and reading interest. The role of teachers in promoting literacy, multiliteracy paradigm, and early reading programs are also explored.',
 'keywords': ['Digital Literacy',
  'E-Learning',
  'School Literacy Movement',
  'Information Literacy',
  'Reading Interest',
  'Teacher Role',
  'Multiliteracy Paradigm',
  'Media Literacy',
  'Early Reading',
  'Literacy Program'],
 'ids': {'openalex': 'https://openalex.org/T13602',
  'wikipedia': 'https://en.wikipedia.org/wiki/Digital_literacy'},
 'subfield': {'id': 'https://openalex.org/subfields/1710',
  'display_name': 'Information Systems'},
 'field': {'id': 'https://openalex.org/fields/17',
  'display_name': 'Computer Science'},
 'domain': {'id': 'https://open

In [6]:
# create dataframe: one row per topic record, columns taken from the record keys
import pandas as pd
df = pd.DataFrame(topics)

In [7]:
# Preview the first three rows of the topic table
df.head(3)

Unnamed: 0,id,display_name,description,keywords,ids,subfield,field,domain,works_count,cited_by_count,updated_date,created_date
0,https://openalex.org/T13602,Enhancing Digital Literacy through Education,This cluster of papers focuses on enhancing di...,"[Digital Literacy, E-Learning, School Literacy...","{'openalex': 'https://openalex.org/T13602', 'w...","{'id': 'https://openalex.org/subfields/1710', ...","{'id': 'https://openalex.org/fields/17', 'disp...","{'id': 'https://openalex.org/domains/3', 'disp...",248836,106640,2024-02-21T22:43:16.673187,2024-01-23
1,https://openalex.org/T10181,Statistical Machine Translation and Natural La...,This cluster of papers focuses on statistical ...,"[Statistical Machine Translation, Neural Machi...","{'openalex': 'https://openalex.org/T10181', 'w...","{'id': 'https://openalex.org/subfields/1702', ...","{'id': 'https://openalex.org/fields/17', 'disp...","{'id': 'https://openalex.org/domains/3', 'disp...",208791,1820996,2024-02-21T22:43:51.033212,2024-01-23
2,https://openalex.org/T10320,Neural Network Fundamentals and Applications,This cluster of papers covers a wide range of ...,"[Neural Networks, Self-Organizing Maps, Backpr...","{'openalex': 'https://openalex.org/T10320', 'w...","{'id': 'https://openalex.org/subfields/1702', ...","{'id': 'https://openalex.org/fields/17', 'disp...","{'id': 'https://openalex.org/domains/3', 'disp...",187127,2285623,2024-02-21T22:42:48.999851,2024-01-23


In [8]:
# save to csv
# NOTE(review): index=False is not passed here, so the row index is written as
# an extra unnamed first column — unlike the embeddings export later in the
# notebook. Confirm which layout the consumers of this file expect.
df.to_csv("topics_cs.csv")

In [9]:
# Create string representation for embeddings, shaped as
# "Topic: <display_name>; Description: <description>; Keywords: <kw1, kw2, ...>"
topic_part = "Topic: " + df["display_name"].str.strip()
description_part = "; Description: " + df["description"].str.strip()
# Each keywords cell is a list of strings; join it into one comma-separated string
keywords_part = "; Keywords: " + df["keywords"].apply(", ".join).str.strip()

df["embedding_input"] = topic_part + description_part + keywords_part
df["embedding_input"].head(3)

0    Topic: Enhancing Digital Literacy through Educ...
1    Topic: Statistical Machine Translation and Nat...
2    Topic: Neural Network Fundamentals and Applica...
Name: embedding_input, dtype: object

In [26]:
# prepare for embedding: keep only the topic id and the text to embed, then
# drop any row where either of the two is missing
embedding_columns = ['id', 'embedding_input']
df = df.loc[:, embedding_columns].dropna()
len(df)

302

In [12]:
import tiktoken

# Estimate the embedding cost before calling the API: count the tokens of every
# input text and multiply the total by the per-token price.
encoding = tiktoken.get_encoding('cl100k_base')
price_per_token = 0.00002 / 1000 # text-embedding-3-small


def count_tokens(text):
    """Return how many tokens `encoding` produces for `text`."""
    return len(encoding.encode(text))


df["n_tokens"] = df["embedding_input"].apply(count_tokens)
total_tokens = df['n_tokens'].sum()
total_price = total_tokens * price_per_token
f"Total number of tokens: {total_tokens} | Price per token: {price_per_token} | Total price: {total_price}"

'Total number of tokens: 33879 | Price per token: 2e-08 | Total price: 0.00067758'

In [13]:
from openai import OpenAI
client = OpenAI() # requires OPENAI_API_KEY environment variable to be set

# Default model used by get_embedding; the cost estimate earlier in the notebook
# is annotated with the same model name.
EMBEDDING_MODEL="text-embedding-3-small"
def get_embedding(text, model=EMBEDDING_MODEL):
    """Return the embedding of ``text`` computed by ``model``.

    Newlines in ``text`` are replaced by spaces before the request is made.
    The text is sent as a one-element batch through the module-level
    ``client``, and the embedding of that single item is returned.
    """
    single_line_text = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line_text], model=model)
    return response.data[0].embedding


# Embed every row's text; get_embedding issues one request per row
df["embedding"] = df["embedding_input"].apply(get_embedding)

In [28]:
# Persist ids, input texts, token counts and embeddings (row index omitted).
# NOTE(review): CSV stores the list-valued `embedding` cells as text, so they
# have to be parsed again (e.g. with ast.literal_eval) after reading this file back.
df.to_csv("topics_cs_embeddings.csv", index=False)

Unnamed: 0,id,embedding_input,n_tokens,embedding
0,https://openalex.org/T13602,Topic: Enhancing Digital Literacy through Educ...,107,"[0.02839984931051731, 0.015877598896622658, 0...."
1,https://openalex.org/T10181,Topic: Statistical Machine Translation and Nat...,116,"[0.0010130326263606548, 0.005919080693274736, ..."
2,https://openalex.org/T10320,Topic: Neural Network Fundamentals and Applica...,101,"[-0.014948883093893528, -0.010005757212638855,..."
3,https://openalex.org/T13734,Topic: Artificial Intelligence and Expert Syst...,118,"[-0.010487883351743221, 0.006784500554203987, ..."
4,https://openalex.org/T10715,Topic: Distributed Grid Computing Systems; Des...,91,"[0.0013633642811328173, -0.005792532116174698,..."
...,...,...,...,...
297,https://openalex.org/T14174,Topic: Smart Systems and IoT Applications; Des...,85,"[-0.007974784821271896, -0.005506804678589106,..."
298,https://openalex.org/T14455,Topic: Data Communications and Networking Tech...,134,"[0.019931675866246223, -0.0016256003873422742,..."
299,https://openalex.org/T13345,Topic: Efficient Cooperative Image Transmissio...,100,"[0.05508171766996384, -0.022186990827322006, -..."
300,https://openalex.org/T13932,Topic: Understanding the Digital Economy and K...,114,"[0.012202342972159386, -0.001700123306363821, ..."


In [34]:
import ast
import numpy as np


def _as_embedding_array(value):
    """Return one `embedding` cell as a NumPy array.

    String cells (the text form a list takes after a CSV round trip) are
    parsed with ``ast.literal_eval`` first; cells that already hold a list or
    an array are converted directly.
    """
    if isinstance(value, str):
        value = ast.literal_eval(value)
    return np.array(value)


# ast.literal_eval raises ValueError for a list or array, so applying it
# unconditionally fails when df['embedding'] still holds the lists returned by
# get_embedding, and also when this cell is run a second time. Parsing only
# the string cells makes the cell work in both situations.
df['embedding'] = df['embedding'].apply(_as_embedding_array)

In [49]:
# Free-text query to match against the topics; it is embedded with the same
# helper (and default model) that produced the topic embeddings.
query = "Leveraging Large Language Models for Personalized Suggestion and Summarization of Scientific Publications"
query_embedding = get_embedding(query)

In [50]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# cosine_similarity works on 2-D inputs, so the query becomes a single-row matrix
query_embedding_2d = np.array(query_embedding).reshape(1, -1)

# Stack the per-topic vectors in df['embedding'] into one
# (n_topics, embedding_dim) matrix, in row order
topic_embedding_matrix = np.vstack(df['embedding'].to_list())

# A single call scores the query against every topic at once instead of
# calling cosine_similarity once per row; row 0 of the (1, n_topics) result
# holds the similarities in the same order as df's rows.
similarities = cosine_similarity(query_embedding_2d, topic_embedding_matrix)[0]

In [51]:
# Attach the scores to the topics and rank them, most similar first
df['similarity'] = similarities
sorted_df = df.sort_values('similarity', ascending=False)

# Keep the ten best-matching topics and display them
top_topics = sorted_df.iloc[:10]
top_topics

Unnamed: 0,id,embedding_input,n_tokens,embedding,similarity
149,https://openalex.org/T10462,Topic: Reinforcement Learning Algorithms; Desc...,101,"[0.0038641353603452444, -0.04515691474080086, ...",0.382175
179,https://openalex.org/T12072,Topic: Active Learning in Machine Learning Res...,105,"[-0.03133949637413025, 0.019223827868700027, 0...",0.381579
172,https://openalex.org/T11598,Topic: Machine Learning for Internet Traffic C...,94,"[-0.002179244067519903, -0.014644972048699856,...",0.376717
245,https://openalex.org/T12676,Topic: Theory and Applications of Extreme Lear...,109,"[-0.019221412017941475, -0.018300073221325874,...",0.367894
251,https://openalex.org/T11612,Topic: Optimization Methods in Machine Learnin...,122,"[-0.006085216999053955, -0.025157243013381958,...",0.366122
66,https://openalex.org/T11122,Topic: Educational Data Mining and Learning An...,129,"[0.012617486529052258, 0.00832311436533928, 0....",0.356867
225,https://openalex.org/T14351,Topic: Inductive Modeling in Scientific Resear...,132,"[-0.007745642215013504, 0.018768977373838425, ...",0.352353
266,https://openalex.org/T12794,Topic: Adaptive Dynamic Programming for Optima...,129,"[-0.019769232720136642, -0.029867352917790413,...",0.352197
134,https://openalex.org/T11276,Topic: Machine Learning Methods for Solar Radi...,108,"[-0.03454571217298508, -0.0027602065820246935,...",0.348439
105,https://openalex.org/T13018,Topic: Machine Learning for Earthquake Early W...,120,"[-0.01907801628112793, -0.013615892268717289, ...",0.348083
