# Labor Market Analysis

In [None]:
# The standard stuff
import pandas as pd
import numpy as np
import math

# The world's most amazing NLP library.  Thank you Matthew Honnibal.  Remind me to give you a hug when I see you.
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

# Your classic clustering 
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Ooooooooh - now THIS should be fun.
import umap
import hdbscan

# And make it look pretty!
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
import seaborn as sns

# Load spaCy's large English model, which provides the word vectors used below.
# The model is a separate package and has to be installed before this cell
# runs, e.g. with `python -m spacy download en_core_web_lg`.
nlp = spacy.load('en_core_web_lg')

### Load data and show columns

In [None]:
# Read the raw postings.  A forward slash is used as the path separator because
# it is accepted on Windows, macOS and Linux alike, whereas the previous
# backslash-separated literal only resolved on Windows.
df = pd.read_csv('data/online-job-postings.csv')
print(df.columns.values)

In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
df.head()

In [None]:
%matplotlib inline
# Number of postings per year.  The chart is given a title and axis labels so it
# can be read on its own; the trailing semicolon hides the repr of the last call.
ax = df.groupby('Year')['Title'].count().plot(kind='bar')
ax.set_title('Job postings per year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of postings');

### Remove empty cells in target columns

In [None]:
# Keep only the postings that have a value in every column used by the analysis.
required_columns = ['Title', 'JobDescription', 'Year', 'jobpost']
df = df[df[required_columns].notna().all(axis=1)]

In [None]:
# Check the row count and per-column non-null counts after dropping incomplete rows.
df.info()

### Clean up titles

In [None]:
def cleanTitles(sentence, stop_words=None):
    """Strip punctuation and stop words from a job title.

    Punctuation is replaced by a space rather than deleted, so a title such as
    "Accountant/Cashier" yields the two words "Accountant Cashier" instead of
    the fused token "AccountantCashier".

    Args:
        sentence: The raw title text.
        stop_words: Optional collection of lower-case words to drop.  When
            omitted, the notebook-level STOP_WORDS set is used.

    Returns:
        The remaining words, in their original case and order, joined by
        single spaces.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    # Map every punctuation character to a space so word boundaries survive.
    spaced = sentence.translate(str.maketrans(punctuation, ' ' * len(punctuation)))
    return ' '.join(word for word in spaced.split() if word.lower() not in stop_words)

# Clean every title, mapping over the column instead of building a row-wise lambda.
df['cleanTitle'] = df['Title'].map(cleanTitles)

In [None]:
# Spot-check the first few cleaned titles.
df['cleanTitle'].head()

### Clean up Descriptions

#### Remove description stopwords

In [None]:
def removeStopWords(description, stop_words=None):
    """Drop stop words from a whitespace-separated text.

    Args:
        description: Text to filter.
        stop_words: Optional collection of lower-case words to drop.  When
            omitted, the notebook-level STOP_WORDS set is used; passing a
            custom collection lets the same helper apply any other word list.

    Returns:
        The surviving words, in their original case and order, joined by
        single spaces.  Words are compared in lower case.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    return ' '.join(word for word in description.split() if word.lower() not in stop_words)

# Remove stop words from every job description (column-wise map).
df['cleandesc'] = df['JobDescription'].map(removeStopWords)

In [None]:
# Show the first cleaned description.  Positional access is used because the
# index has not been reset since the incomplete rows were dropped, so a row
# labelled 0 is not guaranteed to exist.
df['cleandesc'].iloc[0]

#### Count most popular words
In this section, we'll find low-signal words that appear across all job postings, such as "job" and "performance", and manually add them to a custom stop-word list.  We pick them by hand rather than simply dropping the most frequent words, because some high-signal words, such as "software", are also near the top.

In [None]:
# Flatten every cleaned description into one whitespace-normalised string.
alldescs = ' '.join(' '.join(df['cleandesc'].values).split())

# Tally how often each lower-cased word occurs across all descriptions.
wordcount = {}
for word in alldescs.lower().split():
    wordcount[word] = wordcount.get(word, 0) + 1

# Most frequent words first.
sorted_wordcount = sorted(wordcount.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# The 25 most frequent words, as (word, count) pairs.
sorted_wordcount[0:25]

From this list, we'll select the words that may help in clearing up the clustering.

In [None]:
# Frequent words judged to carry little signal, hand-picked for removal.
custom_stopwords = [
    'responsible', 'looking', 'incumbent', 'position', 'seeking', 'work',
    'support', 'team', 'candidate', 'llc', 'company', 'activities',
    'ensure', 'armenian', 'candidates', '-', 'armenia',
]


def removeCustomStopWords(description):
    """Return `description` without the words listed in `custom_stopwords`.

    The text is split on whitespace, each word is compared in lower case
    against the list, and the survivors are re-joined with single spaces.
    """
    kept = []
    for word in description.split():
        if word.lower() in custom_stopwords:
            continue
        kept.append(word)
    return ' '.join(kept)

# Strip the hand-picked stop words from the already-cleaned descriptions.
# (This used to call removeStopWords a second time, so the custom list was never
# applied and 'newcleandesc' ended up identical to 'cleandesc'.)
df['newcleandesc'] = df['cleandesc'].map(removeCustomStopWords)
# Renumber the rows from 0 now that the incomplete postings are gone.
df = df.reset_index(drop=True)


### Clean up jobposts

#### Remove jobposts stopwords

In [None]:
def removeStopWords(description, stop_words=None):
    """Drop stop words from a whitespace-separated text.

    This re-declares the helper of the same name from the description
    clean-up section, with the same default behaviour.

    Args:
        description: Text to filter.
        stop_words: Optional collection of lower-case words to drop.  When
            omitted, the notebook-level STOP_WORDS set is used; passing a
            custom collection lets the same helper apply any other word list.

    Returns:
        The surviving words, in their original case and order, joined by
        single spaces.  Words are compared in lower case.
    """
    if stop_words is None:
        stop_words = STOP_WORDS
    return ' '.join(word for word in description.split() if word.lower() not in stop_words)

# Remove stop words from the full text of every posting (column-wise map).
df['cleanjob'] = df['jobpost'].map(removeStopWords)

In [None]:
# Show the first cleaned posting (the index was reset above, so the first row is row 0).
df['cleanjob'].iloc[0]

#### Count most popular words
In this section, we'll find low-signal words that appear across all job postings, such as "job" and "performance", and manually add them to a custom stop-word list.  We pick them by hand rather than simply dropping the most frequent words, because some high-signal words, such as "software", are also near the top.

In [None]:
# Join all cleaned postings, then collapse every run of whitespace to one space.
job_text = ' '.join(df['cleanjob'].values)
alldescs = ' '.join(job_text.split())

# Count the occurrences of each lower-cased token.
wordcount = {}
for token in alldescs.lower().split():
    wordcount.setdefault(token, 0)
    wordcount[token] += 1

# Rank the tokens from most to least frequent.
sorted_wordcount = sorted(wordcount.items(), key=lambda item: item[1], reverse=True)

In [None]:
# The 25 most frequent words in the full postings, as (word, count) pairs.
sorted_wordcount[0:25]

In [None]:
# Hand-picked low-signal words for the full postings: the list used for the
# descriptions plus a few that only show up in the full text.  The '-' entry
# used to be listed twice; each word now appears exactly once.
custom_stopwords = [
    '-',
    'job',
    'application',
    'website',
    'ability',
    'responsible',
    'looking',
    'incumbent',
    'position',
    'seeking',
    'work',
    'support',
    'team',
    'candidate',
    'llc',
    'company',
    'activities',
    'ensure',
    'armenian',
    'candidates',
    'armenia']

def removeCustomStopWords(description):
    """Return `description` without the words listed in `custom_stopwords`.

    Words are split on whitespace and compared in lower case; the survivors
    keep their original case and order and are joined by single spaces.
    """
    return ' '.join([x for x in description.split() if x.lower() not in custom_stopwords])

# Strip the hand-picked stop words from the already-cleaned postings.
# (This used to call removeStopWords a second time, so the custom list was never
# applied and 'newcleanjob' ended up identical to 'cleanjob'.)
df['newcleanjob'] = df['cleanjob'].map(removeCustomStopWords)
df = df.reset_index(drop=True)


### Generate vectors for titles, descriptions, and job posts
We are going to create 3 vectors for each job posting: an average word embedding for the title itself, one for the basic description, and, because we have a bunch of one-liners, a third for the entire post.

In [None]:
# One vector per posting for each of the three cleaned text fields.
vectorsTitles        = []
vectorsDescriptions  = []
vectorsJobPosts      = []
counter = 0

# NOTE(review): only `.vector` is read from each processed text; if the rest of
# the pipeline output is not needed a cheaper call may exist - check the spaCy docs.
text_columns = df[['cleanTitle', 'newcleandesc', 'newcleanjob']]
for counter, (title, description, jobpost) in enumerate(
        text_columns.itertuples(index=False, name=None), start=1):
    vectorsTitles.append(nlp(title).vector)
    vectorsDescriptions.append(nlp(description).vector)
    vectorsJobPosts.append(nlp(jobpost).vector)

    # show progress every 1000 - maybe go grab some coffee.
    if counter % 1000 == 0:
        print(counter)

In [None]:
# Candidate cluster counts, and one K-Means estimator per count.  A fixed seed
# makes the sweep repeatable from run to run.
KMEANS_SEED = 42
numclusters   = range(1, 20)
kmeans        = [KMeans(n_clusters=i, random_state=KMEANS_SEED) for i in numclusters]


def _kmeans_sweep(vectors, label):
    """Fit a fresh K-Means model on `vectors` for every count in `numclusters`.

    Fresh estimators are created here because the shared `kmeans` list used to
    be re-fitted for each of the three data sets in turn, which left all three
    model lists pointing at the same objects (fitted on whichever data set
    came last).

    Args:
        vectors: The vectors to cluster.
        label: Suffix used in the progress messages, e.g. 'Titles'.

    Returns:
        (models, scores, labels): the fitted estimators, `model.score(vectors)`
        for each of them, and each model's `labels_`.
    """
    models = [KMeans(n_clusters=k, random_state=KMEANS_SEED).fit(vectors) for k in numclusters]
    scores = [model.score(vectors) for model in models]
    print('score' + label + ' done.')
    labels = [model.labels_ for model in models]
    print('cluster' + label + ' done.')
    return models, scores, labels


kTitles, scoreTitles, clusterTitles = _kmeans_sweep(vectorsTitles, 'Titles')

kDescriptions, scoreDescriptions, clusterDescriptions = _kmeans_sweep(vectorsDescriptions, 'Descriptions')

kJobPosts, scoreJobPosts, clusterJobPosts = _kmeans_sweep(vectorsJobPosts, 'JobPosts')

In [None]:
 %matplotlib inline

sns.set()
fig, axs = plt.subplots(nrows=3, figsize=(15,20))

# Index each score by the number of clusters that produced it (the sweep starts
# at one cluster), so the x-axis reads as k instead of a 0-based list position.
score_panels = [
    (scoreTitles,       'Clustering score for job titles'),
    (scoreDescriptions, 'Clustering score for job descriptions'),
    (scoreJobPosts,     'Clustering score for full job posts'),
]
for ax, (scores, title) in zip(axs, score_panels):
    pd.Series(scores, index=range(1, len(scores) + 1)).plot(ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Number of clusters')

### Find silhouette scores for each vector

In [None]:
def _silhouette_sweep(vectors):
    """Silhouette score of `vectors` for every estimator in `kmeans` except the first.

    Each estimator is re-fitted on `vectors` via fit_predict.  The first entry
    is skipped because it is the single-cluster model.
    """
    return [silhouette_score(vectors, model.fit_predict(vectors)) for model in kmeans[1:]]


silhouetteTitles = _silhouette_sweep(vectorsTitles)
print('silhouetteTitles done.')

silhouetteDescriptions = _silhouette_sweep(vectorsDescriptions)
print('silhouetteDescriptions done.')

silhouetteJobPosts = _silhouette_sweep(vectorsJobPosts)
print('silhouetteJobPosts done.')

In [None]:

# The silhouette sweep skips the single-cluster model, so its first score
# belongs to k=2.  (The labels used to start at 1, shifting every point by one.)
silhouetteLabels = list(range(2, len(silhouetteTitles) + 2))
fig, axs = plt.subplots(nrows=3, figsize=(15,20))

silhouette_panels = [
    (silhouetteTitles,       'Silhouette score for job titles'),
    (silhouetteDescriptions, 'Silhouette score for job descriptions'),
    (silhouetteJobPosts,     'Silhouette score for full job posts'),
]
for ax, (scores, title) in zip(axs, silhouette_panels):
    ax.plot(silhouetteLabels, scores)
    # Each axis gets its own locator: matplotlib locator instances are not
    # meant to be shared between axes.
    ax.xaxis.set_major_locator(plticker.MultipleLocator(base=1.0))
    ax.set_title(title)
    ax.set_xlabel('Number of clusters')

# Dashed reference line at a silhouette score of 0.05 on the descriptions panel,
# spanning the full range of cluster counts.
axs[1].plot([silhouetteLabels[0], silhouetteLabels[-1]], [0.05, 0.05], linestyle='dashed', lw=2)


Well, those are terrible results.  Let's switch to the big boys.

# Fire up UMAP and HDBSCAN
Whoa, Nelly!  Now we're going to play with the fun stuff.  

In [None]:
# Ideal parameters I've found.  These are the values to experiment with.

# Passed to umap.UMAP as `n_neighbors` when the reducer is created below.
n_neighbors=40
# Passed to hdbscan.HDBSCAN as `min_cluster_size` below.
min_cluster_size=100

In [None]:
# Place the title, description and full-post vectors side by side, giving one wide row per posting.
newVectors = np.hstack([vectorsTitles, vectorsDescriptions, vectorsJobPosts])
newVectors.shape

In [None]:
# Project the combined vectors with UMAP.  The seed is fixed because the manual
# clean-up further down hard-codes cluster numbers, which only stay meaningful
# if this projection (and the clustering built on it) is repeatable.
# NOTE(review): a fixed random_state may make UMAP run slower - confirm acceptable.
reducer = umap.UMAP(n_neighbors=n_neighbors, random_state=42)
embeddings = reducer.fit_transform(newVectors)

In [None]:
# Cluster the UMAP embeddings with HDBSCAN.
hdb = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
hdb.fit(embeddings)
# NOTE: despite their names, these two variables now hold the fitted HDBSCAN
# model and its labels for the combined embeddings, not title-only K-Means results.
scoreTitles = hdb
print("hdb done")
clusterTitles = hdb.labels_

In [None]:
# Print the first 500 cluster labels as a quick look at the assignment.
print(clusterTitles[0:500])

### Let's show the pretty clusters!

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig, axs = plt.subplots(nrows=1, figsize=(15,10))
# Draw on the axes object just created; points are coloured by cluster label.
axs.scatter(embeddings[:, 0], embeddings[:, 1], c=clusterTitles, cmap='rainbow', s=1.5)

In [None]:
# Highest cluster label that was assigned.
clusterTitles.max()

In [None]:
# How many postings received the label -1 (reported as unclustered), and what
# fraction of the data did get a cluster.
cluster_labels = list(clusterTitles)
total_values = len(cluster_labels)
unclustered_values = cluster_labels.count(-1)
print("Unclustered values: ", unclustered_values)
print("Total Values: ", total_values)
print("Data usage: {0:.2f}".format((total_values - unclustered_values) / total_values))

In [None]:
#clusterTitles
# Work on a copy: a plain `hddf = df` makes both names refer to the same frame,
# so adding the cluster column would silently modify `df` as well.
hddf = df.copy()
hddf['cluster'] = clusterTitles
# Spot check: cleaned titles of the postings assigned to cluster 2
# (a single .loc lookup instead of chained indexing).
hddf.loc[hddf['cluster'] == 2, 'cleanTitle']

In [None]:
# Highest cluster label; used later to choose the number of legend columns.
# NOTE(review): if labels start at 0 this is one less than the number of
# clusters - confirm that is what the name is meant to convey.
numClusters = clusterTitles.max()
hddf.head()

In [None]:
# Save the labelled postings; the per-cluster analysis below reloads this file.
hddf.to_csv('jobClusters.csv', index=False)

### Show most popular words for each cluster

#### Count each word, per cluster

In [None]:
hddf = pd.read_csv('jobClusters.csv')

# wordcount[c]        -> {word: occurrences} for the cleaned titles of cluster c
# sorted_wordcount[c] -> the same as (word, count) pairs, most frequent first
wordcount = {}
sorted_wordcount = {}
for i in range(hddf['cluster'].max() + 1):
    # A title that cleaning reduced to an empty string is stored as an empty CSV
    # field and is read back as NaN, which ' '.join() rejects - skip those rows.
    titles = hddf.loc[hddf['cluster'] == i, 'cleanTitle'].dropna().astype(str).values
    titles = ' '.join(' '.join(titles).split())
    wordcount[i] = {}
    for j in titles.lower().split():
        wordcount[i][j] = wordcount[i].get(j, 0) + 1

    sorted_wordcount[i] = sorted(wordcount[i].items(), key=lambda x: x[1], reverse=True)
    print("Cluster: " + str(i))
    print(sorted_wordcount[i][0:5])
    print('---')


#### Find the top 5 words (to get a sense of the cluster's contents)

In [None]:
# topwords[c] is the ' / '-terminated list of cluster c's most frequent title words.
topwords = {}
for key, ranked in sorted_wordcount.items():
    # Take the five most frequent words, as the heading of this section says
    # (the slice used to stop after four).
    topwords[key] = ''.join(word + ' / ' for word, _count in ranked[0:5])
    print("Cluster "+str(key)+": " + topwords[key])

#### A bit of manual cleanup for our purposes

In [None]:
# Lawyers
#hddf['cluster'] = hddf['cluster'].replace(to_replace=24, value=0, inplace=True, axis=1)

def mergeClusters(merge_target, origin, frame=None, wordcounts=None):
    """Fold cluster `merge_target` into cluster `origin`.

    Rows labelled `merge_target` are relabelled `origin`, and the ranked word
    counts of both clusters are combined under `origin`.  (The counts of
    `origin` used to be overwritten by those of `merge_target`.)  If
    `merge_target` has no word-count entry, the word counts are left
    untouched instead of raising KeyError.

    Args:
        merge_target: Label of the cluster that disappears.
        origin: Label of the cluster that absorbs it.
        frame: DataFrame with a 'cluster' column, modified in place.
            Defaults to the notebook-level `hddf`.
        wordcounts: dict mapping a cluster label to a list of (word, count)
            pairs sorted by descending count, modified in place.  Defaults to
            the notebook-level `sorted_wordcount`.

    Returns:
        None.
    """
    if frame is None:
        frame = hddf
    if wordcounts is None:
        wordcounts = sorted_wordcount

    frame.loc[frame['cluster'] == merge_target, 'cluster'] = origin

    absorbed = wordcounts.pop(merge_target, None)
    if absorbed is None:
        # Nothing recorded for the absorbed cluster: keep origin's counts as they are.
        return

    combined = dict(wordcounts.get(origin, []))
    for word, count in absorbed:
        combined[word] = combined.get(word, 0) + count
    wordcounts[origin] = sorted(combined.items(), key=lambda x: x[1], reverse=True)
    

# (cluster to absorb, cluster that absorbs it), grouped by the kind of job they share.
cluster_merges = [
    # Programmers and QA engineers
    (8, 5),
    (9, 5),
    (19, 5),
    # QA Engineers
    (18, 14),
    # Accounting
    (7, 1),
    # Executive Assistants
    (13, 3),
    # Office Managers and HR
    (16, 10),
    # Project Managers
    (27, 12),
    # Legal
    (24, 0),
]
for absorbed_cluster, absorbing_cluster in cluster_merges:
    mergeClusters(absorbed_cluster, absorbing_cluster)


In [None]:
# Cluster labels that remain after the manual merges.
hddf['cluster'].unique()

#### Find top word (for legend)

In [None]:
# firsttopword[c] is the single most frequent title word of cluster c.
firsttopword = {}
for key, ranked in sorted_wordcount.items():
    print("Cluster "+str(key)+": ", end='')
    # `ranked` is ordered by descending count, so its first pair holds the top word.
    top_word, _top_count = ranked[0]
    firsttopword[key] = top_word
    print(firsttopword[key])

In [None]:
# Every cluster label except -1 (the unclustered postings).  Filtering instead of
# list.remove(-1) avoids a ValueError when no posting is unclustered.
clusterList = [label for label in hddf['cluster'].unique() if label != -1]
print(clusterList)

# datecount[cluster][year] -> number of postings of that cluster in that year
datecount = {}
for i in clusterList:
    datecount[i] = {}
    for jobdate in hddf.loc[hddf['cluster'] == i, 'Year'].values:
        datecount[i][jobdate] = datecount[i].get(jobdate, 0) + 1

# Rows are years, columns are clusters.  Years in which a cluster has no postings
# become 0, and the rows are sorted so the time axis runs in chronological order.
datedf = pd.DataFrame(datecount).fillna(0).sort_index()

### Normalize the view to see the trends

In [None]:
# Min-max scale the yearly counts so clusters of different sizes can be compared.
min_max_scaler = preprocessing.MinMaxScaler()
x = datedf.to_numpy()
x_scaled = min_max_scaler.fit_transform(x)
# NOTE(review): the index and column labels of `datedf` are not carried over to `normdf`.
normdf = pd.DataFrame(data=x_scaled)


In [None]:
# Show the cluster -> top word mapping that will label the legend below.
firsttopword

In [None]:
%matplotlib inline
fig, axs = plt.subplots(nrows=1, figsize=(20,15))

sns.set_palette("husl", 20)  # oooh, pretty

datedf.plot(ax=axs)

axs.set_title('Job demand over time, per industry')

# Never request fewer than one legend column: with fewer than three clusters
# int(numClusters/3) evaluates to 0, which the legend cannot lay out.
L=plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.05), ncol=max(1, int(numClusters/3)))
# Replace each numeric legend label with that cluster's most frequent title word.
# The legend entries are assumed to appear in the same order as clusterList.
for legend_text, cluster in zip(L.get_texts(), clusterList):
    legend_text.set_text(firsttopword[cluster])

#### Find top industries (Top values and fastest growth)