In [9]:
import pickle
import numpy as np
import pandas as pd
import plotly.express as ex
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [10]:
# Shared seed reused by the stochastic steps below (row shuffle, t-SNE and
# k-means) so that they are all seeded consistently.
RANDOM_STATE = 21

In [11]:
# Load the preprocessed table.
df = pd.read_csv("data/preprocessed.csv")

# Load the embeddings container that `get_emb` indexes by row label.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("data/embeds.pkl", "rb") as embeds_file:
    embeds = pickle.load(embeds_file)

def get_emb(x):
    """Return the entry of ``embeds`` stored under the row's index label.

    ``x`` is expected to expose a ``name`` attribute (as a row handed over by
    ``DataFrame.apply(..., axis=1)`` does); that value is used as the lookup
    key into the module-level ``embeds`` container.
    """
    row_label = x.name
    return embeds[row_label]

# Attach each row's embedding. `get_emb` looks the vector up by the row's
# index label, so this has to run before any row is dropped or reordered.
df["embd"] = df.apply(get_emb, axis=1)

# Rows whose birth value is the literal "[None]" are blanked out so that the
# dropna() below removes them together with any other incomplete row.
df["birth"] = df["birth"].mask(df["birth"] == "[None]")
df = df.dropna()

# Reduce the raw birth string to an integer, using vectorised string slicing
# instead of a Python-level lambda per row. Values longer than 14 characters
# are cut with [3:-18], shorter ones with [2:-8]; the first two characters of
# the remainder are then dropped before the int conversion.
# NOTE(review): presumably this yields a two-digit birth year (e.g. 70 in the
# displayed output) -- confirm against the raw data format.
birth_raw = df["birth"]
birth_cut = birth_raw.str[3:-18].where(birth_raw.str.len() > 14, birth_raw.str[2:-8])
df["birth"] = birth_cut.str[2:].astype("int64")

# Equivalent to s[:-12][2:] for every string length.
df["d"] = df["d"].str[2:-12]

# Shuffle reproducibly and renumber the rows, then keep only rows with o > 7.
# The index is intentionally left as-is after the filter.
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
df = df[df["o"] > 7]

# One row per remaining record, in the same order as `df`.
pure_embeddings = np.stack(df["embd"].values)

df.head()

Unnamed: 0,q,d,s,o,gender,birth,occupation,embd
1,i firmly believe that bitcoin and the blockcha...,16-05,craig wright,35,male,70,business,"[0.0065686703, 0.020536436, -0.029846523, -0.0..."
15,bitcoin lacks the one thing that makes a good ...,17-07,ewald nowotny,19,male,44,politician,"[-0.07000421, -0.019935759, -0.03778296, 0.000..."
22,the legitimacy this gives bitcoin as a tradeab...,17-11,neil wilson,10,male,78,business,"[-0.022904888, 0.033875033, -0.012582921, -0.0..."
44,we are putting everything into bitcoin weve so...,17-10,didi taihuttu,12,male,78,business,"[-0.04277336, -0.056315128, -0.0167899, 0.0054..."
49,in fact because the extortion emails reuse bit...,16-04,matthew prince,20,male,74,business,"[-0.031868592, 0.05459727, -0.017337315, -0.06..."


---

### t-SNE dimensionality reduction

In [12]:
from sklearn.manifold import TSNE

# `init` and `learning_rate` are pinned to the values this cell has been
# running with implicitly ('random' and 200.0). scikit-learn announces that
# both defaults change in 1.2, so leaving them implicit would silently alter
# the projection after an upgrade (and emits FutureWarnings until then).
tsne = TSNE(
    n_components=2,
    init="random",
    learning_rate=200.0,
    random_state=RANDOM_STATE,
)
components_tsne = tsne.fit_transform(pure_embeddings)
components_tsne.shape


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.


The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.



(545, 2)

---

### Clustering

In [14]:
from sklearn.cluster import DBSCAN, KMeans

# Number of k-means clusters fitted on the embeddings.
N_CLUSTERS = 7

# Cluster on the full-dimensional embeddings (not on the 2-D projection).
# n_init is pinned to 10, the historical default, because newer scikit-learn
# releases change that default and would otherwise alter the clustering.
clustering = KMeans(
    n_clusters=N_CLUSTERS,
    n_init=10,
    random_state=RANDOM_STATE,
).fit(pure_embeddings)

# Cluster ids are categories, not magnitudes: passed as integers plotly would
# draw them on a continuous colour scale, so hand them over as strings to get
# one discrete colour and legend entry per cluster.
cluster_labels = clustering.labels_.astype(str)

fig = ex.scatter(
    x=components_tsne[:, 0],
    y=components_tsne[:, 1],
    color=cluster_labels,
    size=df["o"].to_numpy(),
    # Keep the legend in numeric cluster order instead of first-seen order.
    category_orders={"color": [str(k) for k in range(N_CLUSTERS)]},
    labels={"x": "t-SNE 1", "y": "t-SNE 2", "color": "cluster", "size": "o"},
)

# Export the interactive figure to the docs include directory.
fig.write_html('docs/_includes/embeddings.html')

fig.show()