In [2]:
# Libraries used to train models & manipulate data
import numpy as np
import pandas as pd

# --- Configuration ---
# Recompute the t-SNE embedding of the career map (it takes a while).
# When False, the plotting cell loads the embedding saved under `file_name`.
recalc_tsne = True
# Number of embedding dimensions: 2 is drawn as a 2D scatter, anything else as 3D
visual_dims = 2
# Optionally save the freshly computed embedding to `file_name`
file_save = True
file_name = f'tfidf_visualized_{visual_dims}.2.csv'
# Repetition threshold for removing rarely occurring position titles.
# NOTE(review): no visible cell reads `rep_times`; the cell that applied this
# filter appears to be missing from the notebook - confirm before relying on it.
rep_times = 5

# --- Load data ---
### Kaggle import: https://github.com/Kaggle/kaggle-api
# kaggle datasets download -f dump.csv --unzip killbot/linkedin-profiles-and-jobs-data
# Only the position titles are needed, so read just that column as a Series
pos = pd.read_csv(r'../dump_cleaned.csv', usecols=['posTitle'])['posTitle']
# TfidfVectorizer raises on NaN documents, so drop missing titles up front;
# the index is reset so titles stay positionally aligned with the tf-idf rows
pos = pos.dropna().reset_index(drop=True)

print(pos.shape)
pos.head(3)

(15943,)


0             project manager
1             project manager
2    senior marketing manager
Name: posTitle, dtype: object

In [22]:
# Turn every position title into a tf-idf vector
# (term frequency weighted by inverse document frequency)
from sklearn.feature_extraction.text import TfidfVectorizer

# Presumably the vocabulary size obtained for each min_df value:
#   0.1 -> 2    | 0.01 -> 58    | 0.001 -> 407    | 0 -> 6738
# NOTE(review): the original remark "0.01 without cell above, 0.001 with" refers
# to a preceding cell that is not present in this notebook - confirm which applies.
v = TfidfVectorizer(min_df=0.001)
x = v.fit_transform(pos)

# Densify the sparse result: one row per title, one column per retained term
vecs = pd.DataFrame(data=x.toarray())

# Save it for other files
np.savetxt(fname='tfidf_positions.csv', X=vecs, delimiter=",")

vecs.shape

(727, 373)

In [23]:
# Collapse the N x M tf-idf vectors to N x visual_dims in order to visualize the data
# Principal component analysis (PCA), vs t-Distributed Stochastic Neighbour Embedding (t-SNE)
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# https://nbviewer.jupyter.org/urls/gist.githubusercontent.com/AlexanderFabisch/1a0c648de22eff4a2a3e/raw/59d5bc5ed8f8bfd9ff1f7faa749d1b095aa97d5a/t-SNE.ipynb
if recalc_tsne:
    from sklearn.manifold import TSNE

    # Fixed seed so repeated runs start from the same random state;
    # verbose=1 prints the progress log
    tsne = TSNE(verbose=1, random_state=0, n_components=visual_dims)
    vec_visual = tsne.fit_transform(vecs)

    # Persist the embedding as CSV so it can be reloaded without recomputing
    if file_save:
        np.savetxt(fname=file_name, X=vec_visual, delimiter=",")

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 727 samples in 0.043s...
[t-SNE] Computed neighbors for 727 samples in 0.379s...
[t-SNE] Computed conditional probabilities for sample 727 / 727
[t-SNE] Mean sigma: 0.398299
[t-SNE] KL divergence after 250 iterations with early exaggeration: 73.752243
[t-SNE] KL divergence after 1000 iterations: 1.435389


In [25]:
import plotly.graph_objects as go
import plotly.express as px
# Reuse the embedding cached on disk when t-SNE was not recomputed in this session
if not recalc_tsne:
    vec_visual = pd.read_csv(file_name, header=None).to_numpy()

# Hover labels are matched to points purely by position, so a cached embedding
# built from a different set of titles would silently mislabel every point.
if len(vec_visual) != len(pos):
    raise ValueError(
        f'Embedding has {len(vec_visual)} rows but there are {len(pos)} position titles; '
        f'recompute the embedding (recalc_tsne = True) or refresh {file_name!r}'
    )

# Shared by both visualizers so the 2D and 3D figures are labelled consistently
plot_title = f'TF-IDF Career Map: {len(pos)} Jobs'

# 2D Visualizer
if visual_dims == 2:
    # Display data
    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=vec_visual[:, 0], y=vec_visual[:, 1],
        text=pos,  # position title shown on hover
        mode='markers',
        marker_color='rgba(255, 182, 193, .8)'
    ))
    fig.update_layout(title=plot_title)

# 3D Visualizer (uses the first three embedding columns)
else:
    fig = px.scatter_3d(
        vec_visual,
        x=0,
        y=1,
        z=2,
        hover_name=list(pos),  # same hover labels as the 2D view
        title=plot_title,
    )

fig.show()