In [35]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

train = pd.read_json('w2v_train.json')
train.head(3)

Unnamed: 0,posTitle,posEncoded,members
0,academic tutor,76,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,account coordinator,526,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,account director,416,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [36]:
w2v_inner_dim = 10
num_users = len(train['members'][0])
num_jobs = train.shape[0]
# Simple 2 layer model to create the word2vec matrix
# Predicting word (job) based off context (members who had that job) currently
# people --> job
model = keras.models.Sequential([
    keras.layers.Dense(w2v_inner_dim, input_dim=num_users),
    keras.layers.Dense(num_jobs, activation='softmax')
])

# Hyper parameters
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 10)                59800     
_________________________________________________________________
dense_9 (Dense)              (None, 538)               5918      
Total params: 65,718
Trainable params: 65,718
Non-trainable params: 0
_________________________________________________________________


In [37]:
history = model.fit(
    np.array(train['members'].tolist()), # dtype=np.float32
    np.array(train['posEncoded'].tolist()),
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='accuracy', min_delta=0.0001, patience=10)
    ],
    epochs=200,
    shuffle=True,
    verbose=0
)
history.history['accuracy'][-1] * 100

99.81412887573242

In [38]:
w2v_matrix = pd.DataFrame(model.get_weights()[2]).transpose()
w2v_matrix = pd.concat([train['posTitle'], w2v_matrix], axis=1)

w2v_matrix.to_json('pos_w2v_matrix.json')
w2v_matrix.head()

Unnamed: 0,posTitle,0,1,2,3,4,5,6,7,8,9
0,academic tutor,0.314774,0.366791,-0.361189,-0.376355,0.50421,0.421843,-0.299089,0.322352,0.387822,0.247664
1,account coordinator,-0.443244,0.187873,0.381465,0.382059,-0.387396,0.474691,0.413804,0.367517,-0.417806,-0.380789
2,account director,0.248015,0.344462,-0.025905,-0.453014,-0.305691,-0.20972,-0.302719,0.484575,0.326791,0.245828
3,account executive,-0.420903,0.529688,0.340185,0.229981,-0.168508,0.612626,-0.362432,-0.405646,0.374507,0.483424
4,account manager,0.294179,0.30962,-0.268075,0.293654,-0.343437,-0.171601,0.419101,0.270469,0.315768,-0.254636


In [39]:
from sklearn.manifold import TSNE

# get_weights returns weights & biases -> we want the 2nd matrix of weights (w2v_inner_dim by # of jobs)
w2v_matrix = pd.read_json('pos_w2v_matrix.json').drop(['posTitle'], axis=1)

# Collapse matrix into Nx2
tsne = TSNE(n_components=2, random_state=0, verbose=1)
w2v_visual = tsne.fit_transform(w2v_matrix)
w2v_matrix.shape

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 538 samples in 0.002s...
[t-SNE] Computed neighbors for 538 samples in 0.043s...
[t-SNE] Computed conditional probabilities for sample 538 / 538
[t-SNE] Mean sigma: 0.468526
[t-SNE] KL divergence after 250 iterations with early exaggeration: 68.635750
[t-SNE] KL divergence after 1000 iterations: 1.874615


(538, 10)

In [40]:
import plotly.graph_objects as go
# Display data
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=w2v_visual[:,0], y=w2v_visual[:,1],
    text=train['posTitle'], # ! LABELS ARE WRONG
    mode='markers',
    marker_color='rgba(255, 182, 193, .8)'
))
fig.update_layout(title='Word2Vec 2D Career Map')