In [63]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as ex
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [7]:
# Load the preprocessed table that the rest of the notebook works on.
PREPROCESSED_CSV = "data/preprocessed.csv"
df = pd.read_csv(PREPROCESSED_CSV)
df

Unnamed: 0,q,d,s,o,gender,birth,occupation
0,even quantum cryptography does not solve all o...,2008-10-18 02:33:56,bruce schneier,1,male,['+1963-01-15T00:00:00Z'],cryptographer
1,this is good work great cryptography i love th...,2008-12-30 22:21:35,bruce schneier,2,male,['+1963-01-15T00:00:00Z'],cryptographer
2,people can foresee the future only when it coi...,2008-11-04 05:14:51,george orwell,6,male,[None],
3,the mathematics of ciphers number theory rsa ...,2008-11-21 08:38:20,bruce schneier,1,male,['+1963-01-15T00:00:00Z'],cryptographer
4,can disarm the normal interest groups and othe...,2008-11-06 06:51:34,barack obama,4,male,['+1961-08-04T00:00:00Z'],politician
...,...,...,...,...,...,...,...
7806,one of the main problems with bitcoin is that ...,2017-11-13 11:51:41,hussein sayed,31,male,['+1991-09-18T00:00:00Z'],investor
7807,the only way it has value is if the next guy i...,2017-11-27 09:42:09,neil wilson,7,male,['1978-06-26'],ceo
7808,we are expanding our blockchain teams across e...,2017-11-15 22:00:00,paul brody,3,male,['+1961-01-01T00:00:00Z'],trumpeter
7809,cryptocurrencies are in an exponential bubble ...,2017-12-28 12:12:00,ron paul,1,male,['+1935-08-20T00:00:00Z'],politician


In [8]:
# Load the precomputed embedding matrix.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
EMBEDS_PICKLE = "data/embeds.pkl"
with open(EMBEDS_PICKLE, mode="rb") as pickle_file:
    embeds = pickle.load(pickle_file)
embeds.shape

(7811, 384)

In [30]:
def get_emb(x):
    """Look up the embedding for one DataFrame row.

    ``x`` is a row object exposing a ``name`` attribute (the row's index
    label); that label is used to index the module-level ``embeds``.
    """
    # assumes index labels coincide with row positions in `embeds` -- TODO confirm
    row_label = x.name
    return embeds[row_label]

# Attach each row's embedding (looked up through get_emb) as a new column.
df["embd"] = df.apply(get_emb, axis=1)
# Replace missing occupations with the literal string 'None'.
df["occupation"] = df["occupation"].fillna('None')
df

Unnamed: 0,q,d,s,o,gender,birth,occupation,embd
0,even quantum cryptography does not solve all o...,2008-10-18 02:33:56,bruce schneier,1,male,['+1963-01-15T00:00:00Z'],cryptographer,"[-0.043082796, -0.018537337, -0.031235695, 0.0..."
1,this is good work great cryptography i love th...,2008-12-30 22:21:35,bruce schneier,2,male,['+1963-01-15T00:00:00Z'],cryptographer,"[-0.062056527, 0.047530353, -0.102777876, -0.0..."
2,people can foresee the future only when it coi...,2008-11-04 05:14:51,george orwell,6,male,[None],,"[0.007510899, 0.027965602, 0.0928818, 0.030693..."
3,the mathematics of ciphers number theory rsa ...,2008-11-21 08:38:20,bruce schneier,1,male,['+1963-01-15T00:00:00Z'],cryptographer,"[-0.06203978, 0.05027579, -0.06726297, 0.01422..."
4,can disarm the normal interest groups and othe...,2008-11-06 06:51:34,barack obama,4,male,['+1961-08-04T00:00:00Z'],politician,"[-0.022280598, 0.03585137, 0.029202083, 0.0018..."
...,...,...,...,...,...,...,...,...
7806,one of the main problems with bitcoin is that ...,2017-11-13 11:51:41,hussein sayed,31,male,['+1991-09-18T00:00:00Z'],investor,"[0.023816716, 0.0022417386, -0.04340269, -0.02..."
7807,the only way it has value is if the next guy i...,2017-11-27 09:42:09,neil wilson,7,male,['1978-06-26'],ceo,"[-0.006214832, 0.0041340687, -0.05988493, -0.0..."
7808,we are expanding our blockchain teams across e...,2017-11-15 22:00:00,paul brody,3,male,['+1961-01-01T00:00:00Z'],trumpeter,"[0.032797523, -0.062365446, -0.025915723, -0.0..."
7809,cryptocurrencies are in an exponential bubble ...,2017-12-28 12:12:00,ron paul,1,male,['+1935-08-20T00:00:00Z'],politician,"[-0.01813105, -0.016683614, -0.05587506, 0.031..."


In [37]:
# `embeddings` is the name the dimensionality-reduction cells below work with.
embeddings = embeds
# The embeddings are projected without extra standardization; scikit-learn's
# PCA centers the input itself. random_state pins the result whenever a
# randomized SVD solver gets selected, so a re-run yields the same projection.
components = PCA(n_components=2, random_state=42).fit_transform(embeddings)

components.shape

(7811, 2)

In [39]:
# Pass the coordinates by keyword: the first positional parameter of
# plotly.express.scatter is `data_frame`, so positional arrays would be
# interpreted as (data_frame, x) instead of (x, y).
ex.scatter(
    x=components[:, 0],
    y=components[:, 1],
    color=df.occupation,
    labels={"x": "PC 1", "y": "PC 2"},
    title="PCA of embeddings, colored by occupation",
)

---

### T-SNE

---

In [40]:
# t-SNE starts from a random initialisation (init='random'), so random_state is
# fixed to make the 2-D layout reproducible across kernel restarts.
# TSNE is imported in the notebook's import cell.
components_tsne = TSNE(
    n_components=2, learning_rate='auto', init='random', random_state=42
).fit_transform(embeddings)
components_tsne.shape

(7811, 2)

In [41]:
# Pass the coordinates by keyword: the first positional parameter of
# plotly.express.scatter is `data_frame`, so positional arrays would be
# interpreted as (data_frame, x) instead of (x, y).
ex.scatter(
    x=components_tsne[:, 0],
    y=components_tsne[:, 1],
    color=df.occupation,
    labels={"x": "t-SNE 1", "y": "t-SNE 2"},
    title="t-SNE of embeddings, colored by occupation",
)

---

Maybe we will see something better if we keep only the most popular occupations (the top `N_OCCUPATIONS`, set in the next cell).

In [51]:
N_OCCUPATIONS = 5
# Rank occupations by the total of column 'o' and keep the N_OCCUPATIONS largest.
# Selecting 'o' before summing avoids aggregating every other column (text,
# dates, embedding arrays), which is wasteful and fragile on object columns.
# Note: missing occupations were relabelled 'None' earlier and are ranked like
# any other group.
occupations_top_n = (
    df.groupby('occupation')['o']
    .sum()
    .sort_values(ascending=False)
    .index[:N_OCCUPATIONS]
)
occupations_top_n

Index(['economist', 'businessperson', 'entrepreneur', 'scientist', 'director'], dtype='object', name='occupation')

In [59]:
# Keep only the rows whose occupation is among the selected top occupations.
# `isin` is the vectorized form of the per-element `x in occupations_top_n` test.
df_occupations_top_n = df[df["occupation"].isin(occupations_top_n)]
df_occupations_top_n.head()

Unnamed: 0,q,d,s,o,gender,birth,occupation,embd
7,cryptography has become an important part of o...,2008-10-15 18:41:21,richard brown,1,male,['1916-01-01'],director,"[0.015955096, 0.05288954, -0.034609336, -0.100..."
9,cryptography has become an important part of o...,2008-10-14 16:28:43,richard brown,1,male,['1916-01-01'],director,"[-0.003245669, 0.022950932, -0.047379404, -0.0..."
11,we will be using strong authentication methods...,2008-10-27 16:05:08,vint cerf,1,male,['+1943-06-23T00:00:00Z'],scientist,"[-0.08502854, 0.062946305, -0.038307853, -0.05..."
21,one of big hurdles is that this is something y...,2009-07-06 23:50:49,dan kaminsky,2,male,['+1979-00-00T00:00:00Z'],scientist,"[-0.01391253, 0.011913197, -0.11620632, -0.003..."
23,so why hasnt the nobel foundation bowed to fem...,2009-10-08 06:03:47,larry summers,1,male,['+1954-11-30T00:00:00Z'],economist,"[0.018347966, -0.032968037, -0.013841194, 0.08..."


In [67]:
# Collect the per-row embedding vectors of the filtered frame and stack them
# into a single array (one row per remaining quote).
top_n_vectors = list(df_occupations_top_n["embd"])
embeddings_occupations_top_n = np.array(top_n_vectors)
embeddings_occupations_top_n.shape

(3205, 384)

In [68]:
# Re-run t-SNE on the top-occupation subset only. random_state is fixed because
# init='random' otherwise gives a different layout on every run.
# NOTE: this rebinds `components_tsne`, replacing the full-data projection
# computed earlier in the notebook.
components_tsne = TSNE(
    n_components=2, learning_rate='auto', init='random', random_state=42
).fit_transform(embeddings_occupations_top_n)
components_tsne.shape

(3205, 2)

In [69]:
# Pass the coordinates by keyword: the first positional parameter of
# plotly.express.scatter is `data_frame`, so positional arrays would be
# interpreted as (data_frame, x) instead of (x, y).
ex.scatter(
    x=components_tsne[:, 0],
    y=components_tsne[:, 1],
    color=df_occupations_top_n.occupation,
    labels={"x": "t-SNE 1", "y": "t-SNE 2"},
    title="t-SNE of embeddings for the top occupations",
)