In [31]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Pickling works, maybe use a json? https://datatofish.com/export-pandas-dataframe-json/
train = pd.read_json('w2v_train.json')
train.head(3)

Unnamed: 0,posTitle,memberUrn,posEncoded
0,.,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2018
1,.NET Developer,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",22
2,.NET Software Developer,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2682


In [32]:
w2v_inner_dim = 10
num_users = len(train['memberUrn'][0])
num_jobs = train.shape[0]
# Simple 2 layer model to create the word2vec matrix
# Predicting word (job) based off context (members who had that job) currently
# people --> job
model = keras.models.Sequential([
    keras.layers.Dense(w2v_inner_dim, input_dim=num_users),
    keras.layers.Dense(num_jobs, activation='softmax')
])

# Hyper parameters
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 10)                59550     
_________________________________________________________________
dense_11 (Dense)             (None, 2817)              30987     
Total params: 90,537
Trainable params: 90,537
Non-trainable params: 0
_________________________________________________________________


In [33]:
history = model.fit(
    np.array(train['memberUrn'].tolist(), dtype=np.float32),
    np.array(train['posEncoded'].tolist(), dtype=np.float32),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.0001, patience=3)
    ],
    epochs=200,
    shuffle=True,
    verbose=1
)
history.history['accuracy'][-1] * 100

Train on 2817 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200

91.08980894088745

In [34]:
w2v_matrix = pd.DataFrame(model.get_weights()[2]).transpose()
w2v_matrix = pd.concat([train['posTitle'], w2v_matrix], axis=1)

w2v_matrix.to_json('pos_w2v_matrix.json')
w2v_matrix.head()

Unnamed: 0,posTitle,0,1,2,3,4,5,6,7,8,9
0,.,-0.445736,0.454666,-0.485675,0.315932,-0.442622,0.462069,0.442422,0.459545,-0.463913,-0.444833
1,.NET Developer,0.750445,0.594099,-0.128781,-0.330954,0.545818,0.657608,0.752742,0.594418,0.514319,-0.426295
2,.NET Software Developer,0.76599,-0.67501,-0.67905,0.20599,-0.438756,0.427324,0.12299,0.640065,-0.092565,-0.586531
3,.NET Technical Lead,-0.508125,-0.561494,0.355149,-0.505412,-0.031951,-0.519004,0.669519,-0.652075,-0.816302,0.680511
4,.Net Developer,0.376619,0.005014,0.685517,-0.79024,0.538858,-0.798129,0.479292,-0.71133,-0.775852,0.706027


In [35]:
from sklearn.manifold import TSNE

# get_weights returns weights & biases -> we want the 2nd matrix of weights (w2v_inner_dim by # of jobs)
w2v_matrix = pd.read_json('pos_w2v_matrix.json').drop(['posTitle'], axis=1)

# Collapse matrix into Nx2
tsne = TSNE(n_components=2, random_state=0, verbose=1)
w2v_visual = tsne.fit_transform(w2v_matrix)
w2v_matrix.shape

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2817 samples in 0.012s...
[t-SNE] Computed neighbors for 2817 samples in 0.271s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2817
[t-SNE] Computed conditional probabilities for sample 2000 / 2817
[t-SNE] Computed conditional probabilities for sample 2817 / 2817
[t-SNE] Mean sigma: 0.627204
[t-SNE] KL divergence after 50 iterations with early exaggeration: 84.008644
[t-SNE] KL divergence after 1000 iterations: 2.467782


(2817, 10)

In [36]:
import plotly.graph_objects as go
# Display data
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=w2v_visual[:,0], y=w2v_visual[:,1],
    text=train['posTitle'], # ! LABELS ARE WRONG
    mode='markers',
    marker_color='rgba(255, 182, 193, .8)'
))
fig.update_layout(title='Word2Vec 2D Career Map')