In [76]:
import pandas as pd
import numpy as np
import plotly.express as px
import openai
from openai.embeddings_utils import get_embedding
from sklearn.manifold import TSNE

# Read the first line of the local 'openai_secret_key' file, strip surrounding
# whitespace, and hand the result to the openai client as its API key.
with open('openai_secret_key') as key_file:
    openai_secret_key = key_file.readline().strip()
openai.api_key = openai_secret_key

In [36]:
# Load the raw interests table; the file is parsed as CSV even though it is named .txt.
input_path = 'interests_raw.txt'
with open(input_path) as raw_file:
    interests = pd.read_csv(filepath_or_buffer=raw_file)

In [26]:
# Settings for the OpenAI embedding model.
embedding_model = "text-embedding-ada-002"
# Token encoding that goes with text-embedding-ada-002.
embedding_encoding = "cl100k_base"
# Token budget, kept below the model's maximum of 8191.
max_tokens = 8_000

In [38]:
# Lower-case every interest name, then drop rows whose name repeats
# (drop_duplicates keeps the first occurrence by default).
# `.str.lower()` replaces `.apply(str.lower)` so that missing values stay NaN
# instead of raising TypeError, and bracket indexing makes the assignment
# target the 'name' column explicitly.
interests['name'] = interests['name'].str.lower()
interests.drop_duplicates('name', inplace=True)

In [85]:
# Rebind `interests` to the contents of 'interests_clean.csv'.
# NOTE(review): no visible cell writes this file — confirm where it is produced.
interests = pd.read_csv(filepath_or_buffer='interests_clean.csv')

In [89]:
# Request an embedding for every interest name in a single API call.
embedding_response = openai.Embedding.create(input=interests['name'].tolist(), model=embedding_model)
# Each returned item reports the position of its input in an 'index' field;
# sorting on it keeps every vector aligned with its row instead of relying on
# the order in which the response happens to list them.
embeddings = sorted(embedding_response['data'], key=lambda item: item['index'])
interests['embedding'] = [item['embedding'] for item in embeddings]

In [92]:
# Gather the per-row embedding lists into one array and reduce it to two
# components with t-SNE (random_state is fixed at 42).
embeddings_matrix = np.array(interests['embedding'].tolist())
tsne = TSNE(
    n_components=2,
    perplexity=15,
    random_state=42,
    init='random',
    learning_rate=200,
)
vis_dims = tsne.fit_transform(embeddings_matrix)
# Store the two projected coordinates as new columns on the table.
interests[['vis_dim1', 'vis_dim2']] = vis_dims

In [94]:
# Scatter plot of the two t-SNE coordinates, with the interest name added to the hover box.
# hover_data is passed as a list of column names: that form is the portable one,
# whereas a bare string may not be accepted by every plotly release
# (NOTE(review): confirm against the installed plotly version).
fig = px.scatter(interests, x='vis_dim1', y='vis_dim2', hover_data=['name'])
fig.show()

In [95]:
# Show the table as the cell's output, including the embedding and vis_dim columns added above.
interests

Unnamed: 0,name,embedding,vis_dim1,vis_dim2
0,history,"[0.010230157524347305, -0.023113589733839035, ...",-17.706579,9.441795
1,politics,"[-0.0014740151818841696, -0.024872785434126854...",-16.790216,-28.014124
2,war,"[-0.006708966102451086, -0.01935657113790512, ...",2.550614,13.158960
3,ideology,"[-0.00860591884702444, -0.019965188577771187, ...",11.059896,-33.604469
4,economics,"[-0.00599592225626111, -0.03246382251381874, 0...",-40.777351,-17.398243
...,...,...,...,...
101,chess,"[3.687150456244126e-05, -0.004298575222492218,...",7.149445,-135.571487
102,strategy,"[-0.012461233884096146, -0.031092524528503418,...",24.798500,29.333107
103,tactics,"[-0.024994798004627228, -0.009694055654108524,...",29.466881,25.287920
104,spontaneity,"[-0.009901979006826878, -0.015062354505062103,...",50.413239,-27.376392
