# Imports

In [1]:
import pandas as pd
import numpy as np
import openai
import os
import re
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Load Data

Read in your tabular data file

In [2]:
# Read the first worksheet (sheet_name=0) of the Excel file into a DataFrame
df = pd.read_excel(
    io='data/agenda_export_spspa_202402 MJ.xls',
    sheet_name=0,
)

# Process Data

Write whatever code is necessary for your own data

In [3]:
# Filter for poster sessions. A row is kept only when ALL of these hold:
#   * 'Tracks' contains "Poster"
#   * '*Session Title' does not contain "Poster Session"
#   * 'Authors' is present
#   * 'Description' is present (rows with no description are removed)
keep_mask = (
    df['Tracks'].str.contains('Poster', na=False)
    & ~df['*Session Title'].str.contains('Poster Session', na=False)
    & df['Authors'].notna()
    & df['Description'].notna()
)

# Reset the index once, after every filter has been applied, so the row
# labels are contiguous (0..n-1) and always match the row positions.
df_posters = df.loc[keep_mask].reset_index(drop=True)

# Edit titles to remove the initial "[number]" portion
leading_number = re.compile(r"^\[\d+\]")
df_posters['*Session Title'] = [
    leading_number.sub("", title).strip()
    for title in df_posters['*Session Title']
]

# Get Embeddings

You will need an API key from OpenAI for this.

Here is a [quickstart guide](https://platform.openai.com/docs/quickstart?context=python).

You can also choose a different embedding model; we are using `text-embedding-3-small`, which is current as of February 2024.

More info on the model [here](https://openai.com/blog/new-embedding-models-and-api-updates).

OpenAI also has an [Embeddings Guide](https://platform.openai.com/docs/guides/embeddings).

In [6]:
# Read the API key from the OPENAI_API_KEY environment variable and set it
# on the openai module
openai.api_key = os.environ.get('OPENAI_API_KEY')

# Create the client object that the embedding requests go through
from openai import OpenAI
client = OpenAI()

# Function to get embeddings
def get_embedding(text, model="text-embedding-3-small"):
    """Request an embedding for a single piece of text.

    Newlines in ``text`` are replaced with spaces before the request is sent
    through the module-level ``client``.

    Parameters
    ----------
    text : str
        The text to embed.
    model : str, optional
        Name of the embedding model passed to the API.

    Returns
    -------
    The ``embedding`` attribute of the first item in the response's ``data``.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# Build the text to embed: the title, a single space, then the description
# (the description column is cast to str first)
df_posters['Description'] = df_posters['Description'].astype(str)
df_posters['combined'] = (
    df_posters['*Session Title'] + ' ' + df_posters['Description']
)

# One embedding request per row (this can take some time)
df_posters['embedding_3small'] = df_posters['combined'].apply(
    get_embedding, model='text-embedding-3-small'
)

In [8]:
# Create and save embeddings matrix
# Stack the per-row embedding vectors into one 2-D array (one row per
# DataFrame row) in a single call. This does not depend on the DataFrame's
# index labels, so it works even when the index has gaps or does not start
# at 0. It also avoids re-copying a growing array on every iteration and
# always yields a 2-D result, including when there is only one row.
embeddings_array = np.vstack(df_posters['embedding_3small'].tolist())

np.savez('output/embeddings_array.npz', embeddings_array = embeddings_array)

# Dimensionality Reduction

You can use whichever methods you prefer for this.

We just used the default settings, but cleaner separation may well be possible by tweaking the hyperparameters.

This is more of an aesthetic choice: the similarity scores used to find matching papers are computed with a different method (cosine similarity) on the full 1536-dimensional embedding vector.

In [9]:
# Reduce to 2 dimensional data and save these values to df
# PCA
# random_state is fixed (same seed as t-SNE below) so the 2-D coordinates are
# reproducible across runs even if PCA selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=42)
embeddings_2d_pca = pca.fit_transform(embeddings_array)

# Add to df
df_posters['pca1'] = embeddings_2d_pca[:,0]
df_posters['pca2'] = embeddings_2d_pca[:,1]

# t-SNE
tsne = TSNE(n_components=2, random_state=42)
embeddings_2d_tsne = tsne.fit_transform(embeddings_array)

# Add to df
df_posters['tSNE1'] = embeddings_2d_tsne[:,0]
df_posters['tSNE2'] = embeddings_2d_tsne[:,1]

# Save dataframe (index is not written to the CSV)
df_posters.to_csv('output/spsp_wEmbeddings.csv', index=False)