In [1]:
# Libraries used to train models & manipulate data
import numpy as np
import pandas as pd

### Kaggle import: https://github.com/Kaggle/kaggle-api
# kaggle datasets download -f dump.csv --unzip killbot/linkedin-profiles-and-jobs-data
# Raw profile dump, cut down immediately to the two columns this notebook uses
# (`filter` keeps only the listed labels that actually exist in the frame).
members = pd.read_csv(r'./dump.csv').filter(items=['memberUrn', 'posTitle'])
# Header-less CSV of position vectors; used further down to size and fit the network
pos_vecs = pd.read_csv(r'./tfidf_positions.csv', header=None)
members.head(3)

Unnamed: 0,memberUrn,posTitle
0,urn:li:member:10013136,Portfolio Executive
1,urn:li:member:10013136,Solution Delivery Executive
2,urn:li:member:10013136,Project Manager


In [2]:
# Simplify the member id: keep only the text after the last ':' of the urn
# (e.g. 'urn:li:member:10013136' -> 10013136).
# Going through `astype(str)` keeps this cell safe to re-run: ids that were
# already converted to integers pass through unchanged instead of failing on
# a string-only `.split`. The explicit 'int64' avoids a platform-dependent
# default integer width.
members['memberUrn'] = (
    members['memberUrn']
    .astype(str)
    .str.rsplit(':', n=1)
    .str[-1]
    .astype('int64')
)

print(members.shape)
members.head(3)

(39537, 2)


Unnamed: 0,memberUrn,posTitle
0,10013136,Portfolio Executive
1,10013136,Solution Delivery Executive
2,10013136,Project Manager


In [3]:
# Group members by position: one row per title, carrying the list of member ids
# that held it. (Author's note, not re-checked here: 2817 jobs w/ more than 1 member.)
members_grouped = (
    members
    .groupby('posTitle')['memberUrn']
    .apply(list)     # Series of id lists, indexed by posTitle
    .to_frame()      # Series -> DataFrame
    .reset_index()   # posTitle goes back to being an ordinary column
)

print(members_grouped.shape)
members_grouped.head(3)

(23907, 2)


Unnamed: 0,posTitle,memberUrn
0,Badminton Head Coach,[121664157]
1,CRO specialist,[110248053]
2,Consultant for Hays Office Support,[487101656]


In [4]:
ids = members['memberUrn'].unique().tolist()
# Position of every member id inside `ids`, built once so that each lookup in
# multiEncode is a dict access instead of a linear `ids.index(...)` scan.
# Must be rebuilt if `ids` is ever reassigned.
_id_positions = {urn: pos for pos, urn in enumerate(ids)}

def multiEncode(urnList):
    """Multi-hot encode a list of member ids against the global `ids` list.

    Args:
        urnList: iterable of member ids; falsy entries (e.g. 0, None) are skipped.

    Returns:
        A list of len(ids) ints, with 1 at the position of every id present in
        `urnList` and 0 everywhere else.

    Raises:
        KeyError: if a truthy id in `urnList` does not occur in `ids`.
    """
    encoded = [0] * len(ids)

    for urn in urnList:
        if urn:
            encoded[_id_positions[urn]] = 1
    return encoded

# Multi-hot encode the members: each title's list of ids becomes a 0/1 vector.
# Run once per fresh `members_grouped`; a second run would feed the 0/1 vectors
# back into multiEncode as if they were member ids.
members_grouped['memberUrn'] = members_grouped['memberUrn'].map(multiEncode)

members_grouped.head(3)

Unnamed: 0,posTitle,memberUrn
0,Badminton Head Coach,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CRO specialist,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Consultant for Hays Office Support,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [5]:
# Build the training frame: title + its vector columns + the encoded members.
# NOTE(review): `concat(axis=1)` pairs rows by index label, so this assumes row i
# of pos_vecs describes row i of members -- confirm against how the CSV was built.
titled_vecs = pd.concat([members['posTitle'], pos_vecs], axis=1)
# Inner-join on the title, then discard every row that still has a missing value
train = titled_vecs.merge(members_grouped, on='posTitle').dropna()
del titled_vecs

print(train.shape)
train.head(3)

(2817, 405)


Unnamed: 0,posTitle,0,1,2,3,4,5,6,7,8,...,394,395,396,397,398,399,400,401,402,memberUrn
0,Portfolio Executive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,Solution Delivery Executive,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Project Manager,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
import tensorflow as tf
from tensorflow import keras

w2v_inner_dim = 200
# Simple 2 layer model to create the word2vec matrix
# Predicting word (job) based off context (members who had that job) currently
# people --> job
# Input: one multi-hot slot per distinct member id.
# Output: one unit per column of pos_vecs. Neither Dense layer sets an activation.
model = keras.models.Sequential([
    keras.layers.Dense(w2v_inner_dim, input_dim=members['memberUrn'].nunique()),
    keras.layers.Dense(pos_vecs.shape[1]),
])

# Hyper parameters
model.compile(
    optimizer='adam',
    # ! WRONG -- flagged by the original author; left unchanged until a
    # replacement loss has been decided on.
    loss='mean_squared_error',
    # The targets are continuous float vectors, so a classification accuracy
    # is not a meaningful score here; report a regression error instead.
    metrics=['mean_absolute_error']
)

model.fit(
    # x: the multi-hot member vectors, stacked into a 2-D float array
    np.array(train['memberUrn'].tolist(), dtype=np.float32),
    # y: the vector columns of `train` (column 0 is posTitle, hence the 1-offset)
    np.array(train.iloc[:, 1:pos_vecs.shape[1]+1], dtype=np.float32),
    epochs=50
)

Train on 2817 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x1baf00487c8>

In [14]:
from sklearn.manifold import TSNE

# get_weights() interleaves weights & biases, so entry 2 is the weight matrix of
# the second Dense layer: w2v_inner_dim rows by pos_vecs.shape[1] columns.
w2v_matrix = model.get_weights()[2]

# Collapse matrix into Nx2 (fixed random_state so the layout is repeatable).
# NOTE(review): t-SNE treats each ROW as a sample, i.e. the w2v_inner_dim hidden
# units here; if one point per output column was intended, the transposed matrix
# would be needed -- confirm.
tsne = TSNE(
    verbose=2,
    random_state=0,
    n_components=2,
)
w2v_visual = tsne.fit_transform(w2v_matrix)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 200 samples in 0.033s...
[t-SNE] Computed neighbors for 200 samples in 0.078s...
[t-SNE] Computed conditional probabilities for sample 200 / 200
[t-SNE] Mean sigma: 0.072859
[t-SNE] Computed conditional probabilities in 0.019s
[t-SNE] Iteration 50: error = 93.3557892, gradient norm = 0.4201024 (50 iterations in 0.154s)
[t-SNE] Iteration 100: error = 109.2771378, gradient norm = 0.2802648 (50 iterations in 0.143s)
[t-SNE] Iteration 150: error = 114.4096832, gradient norm = 0.3141767 (50 iterations in 0.151s)
[t-SNE] Iteration 200: error = 115.3067780, gradient norm = 0.3045754 (50 iterations in 0.110s)
[t-SNE] Iteration 250: error = 115.1217651, gradient norm = 0.3049460 (50 iterations in 0.143s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 115.121765
[t-SNE] Iteration 300: error = 3.2424083, gradient norm = 0.0031097 (50 iterations in 0.129s)
[t-SNE] Iteration 350: error = 2.8681915, gradient norm = 0.0036

In [15]:
import plotly.graph_objects as go

# Display data
fig = go.Figure()

fig.add_trace(go.Scatter(
    # Plot the 2-D t-SNE projection. The raw weight matrix was being plotted
    # before (its first two columns), which left `w2v_visual` unused.
    x=w2v_visual[:, 0], y=w2v_visual[:, 1],
    # ! LABELS ARE WRONG -- there is one point per row of `w2v_matrix`, whereas
    # `train['posTitle']` has one entry per training row, so the hover text does
    # not describe the point it is attached to. Left as-is until the intended
    # labelling is decided.
    text=train['posTitle'],
    mode='markers',
    marker_color='rgba(255, 182, 193, .8)'
))
fig.update_layout(title='TF-IDF Initial Career Map')