In [35]:
# Libraries used to train models & manipulate data
import numpy as np
import pandas as pd

# Set to False to reuse the previously saved t-SNE embedding instead of recomputing it (slow)
recalc_tsne = True
# Number of dimensions of the career-map embedding
visual_dims = 2

### Kaggle import: https://github.com/Kaggle/kaggle-api
# kaggle datasets download -f dump.csv --unzip killbot/linkedin-profiles-and-jobs-data
# Load the dump and keep only the position-title column
pos = pd.read_csv(r'./dump.csv').filter(items=['posTitle'])
print(pos.shape)
pos.head(3)

(39537, 1)


Unnamed: 0,posTitle
0,Portfolio Executive
1,Solution Delivery Executive
2,Project Manager


In [36]:
# Keep only titles that occur more than once, then collapse them to one row each.
#
# NOTE: this cell overwrites `pos` and is not idempotent. After de-duplication every
# title is unique, so running the cell a second time filters out all rows. Re-run
# the cell that loads `pos` before re-running this one.

# Vectorised per-row occurrence count: value_counts() ignores missing titles, so
# map() yields NaN for them and the `> 1` comparison drops those rows.
title_counts = pos['posTitle'].map(pos['posTitle'].value_counts())
pos = pos[title_counts > 1]
# Now drop duplicate entries
pos = pos.drop_duplicates()
print(pos.shape)
pos.head(3)

(2817, 1)


Unnamed: 0,posTitle
2,Project Manager
4,Senior Marketing Manager
11,Marketing Intern


In [37]:
# Turn each position title into a TF-IDF vector
# (term frequency weighted by inverse document frequency).

from sklearn.feature_extraction.text import TfidfVectorizer

# Author's notes on vocabulary size per min_df value:
#   0.1 -> 2    | 0.01 -> 58    | 0.001 -> 407    | 0 -> 6738
# Use 0.01 without the filtering cell above, 0.001 with it.
v = TfidfVectorizer(
    min_df=0.001,
)
x = v.fit_transform(pos['posTitle'])

# Dense copy of the sparse matrix, one row per title
vecs = pd.DataFrame(x.toarray())

# Save it for other files
np.savetxt('tfidf_positions.csv', vecs.to_numpy(), delimiter=",")

vecs.shape

(2817, 403)

In [38]:
# Collapse the NxM vectors to N x visual_dims in order to visualize the data
# Principal component analysis (PCA), vs t-Distributed Stochastic Neighbour Embedding (t-SNE)
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# https://nbviewer.jupyter.org/urls/gist.githubusercontent.com/AlexanderFabisch/1a0c648de22eff4a2a3e/raw/59d5bc5ed8f8bfd9ff1f7faa749d1b095aa97d5a/t-SNE.ipynb
if recalc_tsne:
    from sklearn.manifold import TSNE

    # Fixed random_state keeps the embedding reproducible between runs
    tsne = TSNE(
        n_components=visual_dims,
        random_state=0,
        verbose=2,
    )
    # Embed the TF-IDF vectors into visual_dims dimensions
    vec_visual = tsne.fit_transform(vecs)

    # Cache the embedding so later runs can skip the recomputation
    np.savetxt(
        f'tfidf_visualized_{visual_dims}.csv',
        vec_visual,
        delimiter=",",
    )

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2817 samples in 0.395s...
[t-SNE] Computed neighbors for 2817 samples in 6.932s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2817
[t-SNE] Computed conditional probabilities for sample 2000 / 2817
[t-SNE] Computed conditional probabilities for sample 2817 / 2817
[t-SNE] Mean sigma: 0.000000
[t-SNE] Computed conditional probabilities in 0.137s
[t-SNE] Iteration 50: error = 89.9907532, gradient norm = 0.2293891 (50 iterations in 1.161s)
[t-SNE] Iteration 100: error = 89.0237198, gradient norm = 0.2373024 (50 iterations in 1.140s)
[t-SNE] Iteration 150: error = 90.7230301, gradient norm = 0.2091872 (50 iterations in 1.066s)
[t-SNE] Iteration 200: error = 90.0279846, gradient norm = 0.2130039 (50 iterations in 1.063s)
[t-SNE] Iteration 250: error = 90.7395859, gradient norm = 0.2091197 (50 iterations in 1.185s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 90.739586
[t-SNE] Iteration 300: erro

In [39]:
import plotly.graph_objects as go
import plotly.express as px
# Reuse the cached embedding when t-SNE was not recomputed in this session
if not recalc_tsne:
    vec_visual = pd.read_csv(f'tfidf_visualized_{visual_dims}.csv', header=None).to_numpy()

# A stale cache (or a change to the filtering upstream) would silently attach the
# wrong title to each point, so check that the embedding still lines up with `pos`.
if vec_visual.shape != (len(pos), visual_dims):
    raise ValueError(
        f'Embedding has shape {vec_visual.shape}, expected ({len(pos)}, {visual_dims}); '
        'set recalc_tsne = True to rebuild it'
    )

# 2D Visualizer
if visual_dims == 2:
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=vec_visual[:,0], y=vec_visual[:,1],
        text=pos['posTitle'],
        mode='markers',
        marker_color='rgba(255, 182, 193, .8)'
    ))

# 3D Visualizer
elif visual_dims == 3:
    fig = px.scatter_3d(
        vec_visual,
        x=0,
        y=1,
        z=2,
        # Plain array (not the Series) so titles are matched to points by position
        hover_name=pos['posTitle'].to_numpy(),
    )

# Anything else cannot be drawn by either visualizer; fail with a clear message
else:
    raise ValueError(f'visual_dims must be 2 or 3, got {visual_dims!r}')

# Same title for both visualizers
fig.update_layout(title='TF-IDF Initial Career Map')

fig.show()