In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

train = pd.read_json('./w2v_train.json')
train.head(3)

Unnamed: 0,posTitle,memberUrn,posEncoded
0,.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2018
1,.NET Developer,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",22
2,.NET Software Developer,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2682


In [38]:
w2v_inner_dim = 10
num_users = len(train['memberUrn'][0])
num_jobs = train.shape[0]
# Simple 2 layer model to create the word2vec matrix
# Predicting word (job) based off context (members who had that job) currently
# people --> job
model = keras.models.Sequential([
    keras.layers.Dense(w2v_inner_dim, input_dim=num_users),
    keras.layers.Dense(num_jobs, activation='softmax')
])

# Hyper parameters
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 10)                59550     
_________________________________________________________________
dense_13 (Dense)             (None, 2817)              30987     
Total params: 90,537
Trainable params: 90,537
Non-trainable params: 0
_________________________________________________________________


In [39]:
history = model.fit(
    np.array(train['memberUrn'].tolist()), # dtype=np.float32
    np.array(train['posEncoded'].tolist()),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.0001, patience=3)
    ],
    epochs=200,
    shuffle=True,
    verbose=0
)
history.history['accuracy'][-1] * 100

92.43876338005066

In [40]:
w2v_matrix = pd.DataFrame(model.get_weights()[2]).transpose()
w2v_matrix = pd.concat([train['posTitle'], w2v_matrix], axis=1)

w2v_matrix.to_json('pos_w2v_matrix.json')
w2v_matrix.head()

Unnamed: 0,posTitle,0,1,2,3,4,5,6,7,8,9
0,.,-0.439774,0.47841,0.451135,0.466166,-0.245539,0.491956,-0.421317,-0.3832,-0.521391,-0.478627
1,.NET Developer,-0.719298,-0.747874,-0.021224,0.571161,-0.369559,0.617842,-0.367301,0.55301,-0.569342,0.36459
2,.NET Software Developer,-0.525463,-0.404437,-0.42626,-0.40745,0.201355,0.661924,-0.727341,0.371876,-0.634294,0.300511
3,.NET Technical Lead,-0.919465,0.574483,-0.748855,-0.623329,-0.220682,0.59439,0.541288,-0.715966,0.376393,0.291046
4,.Net Developer,-0.688979,0.635843,-0.597637,-0.725636,-0.446392,-0.425806,0.722211,-0.51382,0.7315,0.581584


In [41]:
from sklearn.manifold import TSNE

# get_weights returns weights & biases -> we want the 2nd matrix of weights (w2v_inner_dim by # of jobs)
w2v_matrix = pd.read_json('pos_w2v_matrix.json').drop(['posTitle'], axis=1)

# Collapse matrix into Nx2
tsne = TSNE(n_components=2, random_state=0, verbose=1)
w2v_visual = tsne.fit_transform(w2v_matrix)
w2v_matrix.shape

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2817 samples in 0.012s...
[t-SNE] Computed neighbors for 2817 samples in 0.350s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2817
[t-SNE] Computed conditional probabilities for sample 2000 / 2817
[t-SNE] Computed conditional probabilities for sample 2817 / 2817
[t-SNE] Mean sigma: 0.630908
[t-SNE] KL divergence after 50 iterations with early exaggeration: 83.996262
[t-SNE] KL divergence after 1000 iterations: 2.503616


(2817, 10)

In [42]:
import plotly.graph_objects as go
# Display data
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=w2v_visual[:,0], y=w2v_visual[:,1],
    text=train['posTitle'], # ! LABELS ARE WRONG
    mode='markers',
    marker_color='rgba(255, 182, 193, .8)'
))
fig.update_layout(title='Word2Vec 2D Career Map')