In [1]:
# Libraries used to train models & manipulate data
import numpy as np
import pandas as pd

# Dimensionality of the t-SNE career-map visualization (2 or 3).
# Changing it means recomputing the embedding, which takes a while.
visual_dims = 2

# Load the positions table and keep only the vector columns:
# everything except the 'posTitle' label column.
df = pd.read_csv('tfidf_positions.csv')
vecs = df.drop(columns=['posTitle'])

# Sanity check: matrix dimensions, then a preview of the first rows.
print(vecs.shape)
vecs.head(3)

(1632, 351)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,341,342,343,344,345,346,347,348,349,350
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [2]:
# Collapse the NxM vectors to N x visual_dims in order to visualize the data
# Principal component analysis (PCA), vs t-Distributed Stochastic Neighbour Embedding (t-SNE)
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# https://nbviewer.jupyter.org/urls/gist.githubusercontent.com/AlexanderFabisch/1a0c648de22eff4a2a3e/raw/59d5bc5ed8f8bfd9ff1f7faa749d1b095aa97d5a/t-SNE.ipynb
from sklearn.manifold import TSNE

# Reduce every row of `vecs` to `visual_dims` coordinates with t-SNE.
# random_state is fixed so the layout is repeatable between runs;
# verbose=1 makes scikit-learn print its progress log.
tsne = TSNE(
    n_components=visual_dims,
    random_state=0,
    verbose=1,
)
vec_visual = tsne.fit_transform(vecs)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1632 samples in 0.108s...
[t-SNE] Computed neighbors for 1632 samples in 1.901s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1632
[t-SNE] Computed conditional probabilities for sample 1632 / 1632
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.642548
[t-SNE] KL divergence after 1000 iterations: 1.560636


In [3]:
import plotly.graph_objects as go
import plotly.express as px

# One plotted point per row of `df`; the count is shown in the figure title.
num_jobs = len(df['posTitle'])
plot_title = f'TF-IDF Career Map: {num_jobs} Jobs'

# 2D Visualizer
if visual_dims == 2:
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=vec_visual[:, 0], y=vec_visual[:, 1],
        text=df['posTitle'],  # position title shown when hovering a point
        mode='markers',
        marker_color='rgba(255, 182, 193, .8)'
    ))
    fig.update_layout(title=plot_title)

# 3D Visualizer
elif visual_dims == 3:
    # x/y/z refer to the three embedding columns of `vec_visual` by position.
    # hover_name and title mirror the 2D branch so the 3D points are labeled
    # with their position title and the figure carries the same heading.
    fig = px.scatter_3d(
        vec_visual,
        x=0,
        y=1,
        z=2,
        hover_name=df['posTitle'],
        title=plot_title,
    )

else:
    # Only 2D and 3D plots are implemented; fail with a clear message instead
    # of an obscure plotting error for any other setting.
    raise ValueError(f'visual_dims must be 2 or 3, got {visual_dims!r}')

fig.show()