In [25]:
# Libraries used to train models & manipulate data
import numpy as np
import pandas as pd

# Read the per-position TF-IDF table from disk
data = pd.read_csv('tfidf_positions.csv')
# Keep every column except the title; these remaining columns are what gets embedded later
vecs = data.drop(columns='posTitle')

print(vecs.shape)
vecs.head(3)

(1632, 344)


Unnamed: 0,academic,account,accountant,accounts,adjunct,administration,administrative,administrator,adviser,advisor,...,vice,video,visiting,visual,volunteer,vp,web,work,worker,writer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Collapse NxM vectors to Nx2 in order to visualize the data
# Principal component analysis (PCA), vs t-Distributed Stochastic Neighbour Embedding (t-SNE)
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# https://nbviewer.jupyter.org/urls/gist.githubusercontent.com/AlexanderFabisch/1a0c648de22eff4a2a3e/raw/59d5bc5ed8f8bfd9ff1f7faa749d1b095aa97d5a/t-SNE.ipynb
from sklearn.manifold import TSNE

# Number of embedding dimensions. Defined in this cell so it runs on a fresh
# kernel instead of relying on leftover kernel state.
# Keep this at 2: the following cell labels the embedding columns ['x', 'y'].
visual_dims = 2

# Collapse the NxM matrix in `vecs` into N x visual_dims.
# random_state=0 seeds the embedding; verbose=1 prints the progress log.
tsne = TSNE(n_components=visual_dims, random_state=0, verbose=1)
vec_visual = tsne.fit_transform(vecs)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1632 samples in 0.117s...
[t-SNE] Computed neighbors for 1632 samples in 1.800s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1632
[t-SNE] Computed conditional probabilities for sample 1632 / 1632
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.808464
[t-SNE] KL divergence after 1000 iterations: 1.528548


In [29]:
# Cleaned dump, merged in below on posTitle (presumably the source of the
# tfidfKey / tfidfKeyNum columns used for colouring -- confirm against the CSV)
dump = pd.read_csv('../dump_cleaned.csv')
# Wrap the 2-D embedding in a frame with named coordinate columns
vec_vis = pd.DataFrame(vec_visual, columns=['x', 'y'])
# Attach the coordinates by row index, merge the dump on the title, keep only
# the columns needed for plotting, then remove repeated rows
df = (
    data
    .join(vec_vis)
    .merge(dump, on='posTitle')
    .filter(items=['posTitle', 'x', 'y', 'tfidfKey', 'tfidfKeyNum'])
    .drop_duplicates()
)
df.head(3)

Unnamed: 0,posTitle,x,y,tfidfKey,tfidfKeyNum
0,guest speaker,29.912535,35.412914,guest speaker,39
2,graduate engineer,-11.897299,33.614655,graduate engineer,381
52,senior sales assistant,29.175669,-1.665344,sales assistant,423


In [39]:
import plotly.express as px

# Distinct job titles and distinct TF-IDF groups, reported in the figure title
d1, d2 = (df[col].nunique() for col in ('posTitle', 'tfidfKey'))

# Scatter of the 2-D embedding: one point per row, coloured by group number,
# with the title and group key shown on hover
fig = px.scatter(
    df,
    x='x',
    y='y',
    color='tfidfKeyNum',
    hover_name='posTitle',
    hover_data=['tfidfKey'],
).update_layout(title=f'TF-IDF Career Map: {d1} Jobs | {d2} Groups')

fig.show()