In [1]:
import os
import pandas as pd
import timeit
import sqlalchemy as alch
import numpy as np
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import Birch

from pprint import pprint

# Plotting tools
#import pyLDAvis
#import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
#import seaborn as sns
#from bokeh.plotting import figure, output_file, show, output_notebook
%matplotlib inline

from nltk.corpus import stopwords
import requests

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from openTSNE import TSNE
from gensim.models import Doc2Vec
from sklearn.metrics import silhouette_score


import pickle
from umap import UMAP

In [2]:
# connect to the database
engine = alch.create_engine('sqlite:///jobs_preprocessed.db', echo=False)

In [None]:
# Retrieve 100k entries from the database
entries = 100000
# initialise a list to the right size for performance reasons
descriptions = [None]*entries

i=0
with engine.connect() as connection:
    result = connection.execute("SELECT jobDescription FROM jobs LIMIT 100000")
    for row in result:
        descriptions[i] = row[0]
        i+=1

In [None]:
print(descriptions[0])

In [None]:
import re
import numpy as np

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import gutenberg
from multiprocessing import Pool
from scipy import spatial

import nltk

In [None]:
%%time
def sent_to_words(sentences):
    count = len(sentences)
    c = 0
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        c+=1
        if int(c % (count/100)) == 0:
            print(c)



In [None]:
data_words = list(sent_to_words(descriptions))

data_words_tagged = [None] * len(data_words)
for i in range(len(data_words)):
    data_words_tagged[i] = TaggedDocument(words = data_words[i], tags = ['sent{}'.format(i)])

In [None]:
%%time
model = Doc2Vec(documents = data_words_tagged, dm = 1, vector_size = 200, window = 5,
                min_count = 1, epochs = 10, workers = Pool()._processes)

In [None]:
from gensim.test.utils import get_tmpfile
model.save('doc2vec_model_200_5')

In [None]:
%%time
pca = PCA(n_components=2).fit_transform(model.docvecs.vectors_docs)

tsne = TSNE(n_jobs=7).fit(model.docvecs.vectors_docs)


In [None]:
%%time
pca_ssd = []
tsne_ssd = []
pca_silhouette = []
tsne_silhouette = []

K = range(10,25)
for k in K:
    print("Clusters:",k)
    km = KMeans(n_clusters=k)
    
    km = km.fit(pca)
    pca_ssd.append(km.inertia_)
    labels = km.labels_
    pca_silhouette.append(silhouette_score(pca, labels, metric = 'euclidean'))
    
    
    km = km.fit(tsne)
    tsne_ssd.append(km.inertia_)
    labels = km.labels_
    tsne_silhouette.append(silhouette_score(tsne, labels, metric = 'euclidean'))

In [None]:
K = range(10,25)
plt.plot(K, pca_ssd, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances PCA')
plt.show()

plt.plot(K, tsne_ssd, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum of squared distances TSNE')
plt.show()

plt.plot(K, pca_silhouette, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette score for PCA')
plt.show()

plt.plot(K, tsne_silhouette, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette score for TSNE')
plt.show()

In [21]:
%%time
# Get all descriptions
with engine.connect() as connection:
    result = connection.execute("SELECT COUNT(*) FROM jobs")
    for row in result:
        count = row[0]
    
# initialise a list to the right size for performance reasons
descriptions = [None]*count

i=0
with engine.connect() as connection:
    result = connection.execute("SELECT jobDescription FROM jobs")
    for row in result:
        descriptions[i] = row[0]
        i+=1

Wall time: 32 s


In [None]:
threshold = int(count/2)

In [None]:
model = Doc2Vec.load('doc2vec_model_200_5')

In [None]:
%%time

data_words = list(sent_to_words(descriptions[:threshold]))

In [None]:
pickle.dump( data_words, open( "D2V_data_words1.p", "wb" ) )

In [None]:
%%time
# transform the data
vectors = [None]*count
c = 0
for i in range(threshold):
    vectors[c] = model.infer_vector(data_words[i])
    c+=1
    if i %100000 == 0:
        print(i)

In [None]:
pickle.dump( vectors, open( "D2V_vectors1.p", "wb" ) )

In [None]:
%%time
data_words = list(sent_to_words(descriptions[threshold:]))

In [None]:
vectors = pickle.load( open( "D2V_vectors1.p", "rb" ) )

In [None]:
%%time
# transform the data
c = threshold
for i in range(threshold):
    vectors[c] = model.infer_vector(data_words[i])
    c+=1
    if i %100000 == 0:
        print(i)

In [None]:
pickle.dump(vectors, open( "D2V_vectors2.p", "wb" ) )

In [3]:
vectors = pickle.load(open( "D2V_vectors2.p", "rb" ) )

In [4]:
%%time
pca = PCA(n_components=3).fit_transform(vectors)

Wall time: 32.8 s


In [5]:
print(pca[:10])

[[ 0.36362406 -0.52429614 -0.7440074 ]
 [-1.1592952  -0.89467717 -0.26843852]
 [-0.20732986  0.8631112  -1.29987776]
 [-0.46274735  0.38433827  0.35954247]
 [ 1.18858631  1.72375793  1.54989091]
 [ 0.92384567 -1.91841493  0.93234553]
 [-0.32118109 -0.97607498  0.63169638]
 [ 0.8092826   0.20788231 -1.64204495]
 [ 0.17022607 -0.62036135 -1.63288667]
 [ 0.47342528 -0.37355397  0.38043337]]


In [6]:
kmeans = KMeans(n_clusters=20, init='k-means++', n_init=10, max_iter=1000, tol=0.0001, 
    verbose=1, n_jobs=7, algorithm='auto')

In [7]:
birch = Birch(n_clusters=20)

In [8]:
clusters = kmeans.fit_predict(pca)



Initialization complete
Iteration 0, inertia 1353590.4722112685
Iteration 1, inertia 1205967.7150884285
Iteration 2, inertia 1181085.1211709888
Iteration 3, inertia 1170159.3467440228
Iteration 4, inertia 1163624.9822231743
Iteration 5, inertia 1159142.0042184836
Iteration 6, inertia 1155796.068891658
Iteration 7, inertia 1153208.0984198346
Iteration 8, inertia 1151072.2211765933
Iteration 9, inertia 1149234.685365174
Iteration 10, inertia 1147602.4224424805
Iteration 11, inertia 1146142.091769948
Iteration 12, inertia 1144823.5031345529
Iteration 13, inertia 1143652.885914905
Iteration 14, inertia 1142614.6569468533
Iteration 15, inertia 1141698.1345389683
Iteration 16, inertia 1140896.520962939
Iteration 17, inertia 1140184.080089763
Iteration 18, inertia 1139564.4138144895
Iteration 19, inertia 1139030.8717277274
Iteration 20, inertia 1138568.3616180348
Iteration 21, inertia 1138165.1712733584
Iteration 22, inertia 1137809.238882441
Iteration 23, inertia 1137497.625504045
Iteration 

Iteration 41, inertia 1141191.0822879567
Iteration 42, inertia 1141064.2115836155
Iteration 43, inertia 1140935.772703136
Iteration 44, inertia 1140805.990297912
Iteration 45, inertia 1140679.1175290926
Iteration 46, inertia 1140554.3784730253
Iteration 47, inertia 1140422.1522653545
Iteration 48, inertia 1140284.0849470773
Iteration 49, inertia 1140141.9456719216
Iteration 50, inertia 1139992.9008825556
Iteration 51, inertia 1139836.159777126
Iteration 52, inertia 1139668.9433032784
Iteration 53, inertia 1139492.1069679244
Iteration 54, inertia 1139304.3740799825
Iteration 55, inertia 1139095.9733439982
Iteration 56, inertia 1138869.9803345343
Iteration 57, inertia 1138634.6785363262
Iteration 58, inertia 1138390.009405923
Iteration 59, inertia 1138145.5888479042
Iteration 60, inertia 1137904.128396935
Iteration 61, inertia 1137662.1573377082
Iteration 62, inertia 1137429.047883032
Iteration 63, inertia 1137194.2608653933
Iteration 64, inertia 1136965.8615299216
Iteration 65, inertia 

Iteration 5, inertia 1183508.6979828738
Iteration 6, inertia 1179361.7605266243
Iteration 7, inertia 1176268.635038215
Iteration 8, inertia 1173815.585256004
Iteration 9, inertia 1171739.4959211089
Iteration 10, inertia 1169949.9103165513
Iteration 11, inertia 1168357.9054415913
Iteration 12, inertia 1166907.0161741052
Iteration 13, inertia 1165531.7389762742
Iteration 14, inertia 1164189.5696769056
Iteration 15, inertia 1162839.481677519
Iteration 16, inertia 1161466.2663984085
Iteration 17, inertia 1160088.0074849727
Iteration 18, inertia 1158742.310851087
Iteration 19, inertia 1157440.795695281
Iteration 20, inertia 1156197.168011805
Iteration 21, inertia 1155029.0202387653
Iteration 22, inertia 1153962.4574761614
Iteration 23, inertia 1152919.4487218757
Iteration 24, inertia 1151864.9015771595
Iteration 25, inertia 1150736.4531366876
Iteration 26, inertia 1149457.7386687926
Iteration 27, inertia 1147959.61191872
Iteration 28, inertia 1146286.390462754
Iteration 29, inertia 1144586.

Iteration 59, inertia 1139662.6824518454
Iteration 60, inertia 1139590.799390488
Iteration 61, inertia 1139521.2729152574
Iteration 62, inertia 1139454.749526321
Iteration 63, inertia 1139389.3505069155
Iteration 64, inertia 1139326.1765749007
Iteration 65, inertia 1139263.2592738075
Iteration 66, inertia 1139202.9511222586
Iteration 67, inertia 1139146.6457405868
Iteration 68, inertia 1139091.8439636629
Iteration 69, inertia 1139039.1492676414
Iteration 70, inertia 1138988.3962960807
Iteration 71, inertia 1138937.1085262396
Iteration 72, inertia 1138886.4380324432
Iteration 73, inertia 1138836.3529687123
Iteration 74, inertia 1138786.8338209782
Iteration 75, inertia 1138738.0042667426
Iteration 76, inertia 1138688.8648211085
Iteration 77, inertia 1138638.890432467
Iteration 78, inertia 1138587.2975153273
Iteration 79, inertia 1138536.5164227707
Iteration 80, inertia 1138485.5261020644
Iteration 81, inertia 1138435.5917330363
Iteration 82, inertia 1138385.2659322296
Iteration 83, inert

Iteration 49, inertia 1135114.4043390276
Iteration 50, inertia 1135038.8353731693
Iteration 51, inertia 1134963.7720586252
Iteration 52, inertia 1134888.7515454234
Iteration 53, inertia 1134812.417094617
Iteration 54, inertia 1134735.1583715354
Iteration 55, inertia 1134657.5137094893
Iteration 56, inertia 1134578.2158680642
Iteration 57, inertia 1134497.0271337356
Iteration 58, inertia 1134418.3079467497
Iteration 59, inertia 1134340.0736779047
Iteration 60, inertia 1134263.1142328826
Iteration 61, inertia 1134186.9291278427
Iteration 62, inertia 1134112.7697054478
Iteration 63, inertia 1134041.341810885
Iteration 64, inertia 1133967.9667226323
Iteration 65, inertia 1133895.1704945117
Iteration 66, inertia 1133824.5995421014
Iteration 67, inertia 1133755.3231738303
Iteration 68, inertia 1133688.206689122
Iteration 69, inertia 1133624.0004643782
Iteration 70, inertia 1133561.2439315454
Iteration 71, inertia 1133496.5575062286
Iteration 72, inertia 1133433.1179892845
Iteration 73, inert

Iteration 3, inertia 1206031.2185671765
Iteration 4, inertia 1195674.595075523
Iteration 5, inertia 1188629.6246791393
Iteration 6, inertia 1182757.6080029747
Iteration 7, inertia 1177451.0041790318
Iteration 8, inertia 1172816.1596588837
Iteration 9, inertia 1169012.8792676663
Iteration 10, inertia 1165941.931714644
Iteration 11, inertia 1163395.33870518
Iteration 12, inertia 1161231.6484282664
Iteration 13, inertia 1159290.5706462495
Iteration 14, inertia 1157509.3709647073
Iteration 15, inertia 1155867.1504000975
Iteration 16, inertia 1154324.9431830137
Iteration 17, inertia 1152908.3092917039
Iteration 18, inertia 1151650.3834580362
Iteration 19, inertia 1150548.0529702797
Iteration 20, inertia 1149581.9777952936
Iteration 21, inertia 1148728.476556837
Iteration 22, inertia 1147953.8151818626
Iteration 23, inertia 1147259.3055667807
Iteration 24, inertia 1146620.053239637
Iteration 25, inertia 1146024.7663814232
Iteration 26, inertia 1145463.454383612
Iteration 27, inertia 1144917.

In [9]:
%%time
clusters_birch = birch.fit_predict(pca)

Wall time: 1min 39s


In [None]:
%%time
umap_reducer = UMAP(n_components=2, verbose=3).fit_transform(vectors)

UMAP(n_components=3, verbose=3)
Construct fuzzy simplicial set
Fri Jul 24 14:53:42 2020 Finding Nearest Neighbors
Fri Jul 24 14:53:42 2020 Building RP forest with 96 trees


In [None]:
umap_clusters = kmeans.fit_predict(umap_reducer) 

In [None]:
umap_clusters_birch = birch.fit_predict(umap_reducer)

In [14]:
v_array = np.array(vectors)

In [17]:
%%time
tsne = TSNE(n_components=2, n_jobs=7, verbose=3).fit(v_array)



--------------------------------------------------------------------------------
TSNE(n_jobs=7, neighbors=None, verbose=3)
--------------------------------------------------------------------------------
===> Finding 90 nearest neighbors using Annoy approximate search using euclidean distance...
   --> Time elapsed: 1259.81 seconds
===> Calculating affinity matrix...
   --> Time elapsed: 137.61 seconds
===> Calculating PCA-based initialization...
   --> Time elapsed: 22.62 seconds
===> Running optimization with exaggeration=12.00, lr=273264.33 for 250 iterations...
Iteration   50, KL divergence 10.7751, 50 iterations in 293.9639 sec
Iteration  100, KL divergence 10.1935, 50 iterations in 343.0744 sec
Iteration  150, KL divergence 10.0885, 50 iterations in 349.0134 sec
Iteration  200, KL divergence 10.0423, 50 iterations in 358.5575 sec
Iteration  250, KL divergence 9.9998, 50 iterations in 426.8196 sec
   --> Time elapsed: 1771.43 seconds
===> Running optimization with exaggeration=1.0

In [18]:
tsne_clusters = kmeans.fit_predict(tsne)



Initialization complete
Iteration 0, inertia 1066637297.9603575
Iteration 1, inertia 920002450.6690989
Iteration 2, inertia 882761768.3296977
Iteration 3, inertia 863483818.7340795
Iteration 4, inertia 852816480.6086824
Iteration 5, inertia 846966234.4702516
Iteration 6, inertia 843720307.3368224
Iteration 7, inertia 841643048.2431446
Iteration 8, inertia 840075994.511811
Iteration 9, inertia 838677182.2849685
Iteration 10, inertia 837376179.5485458
Iteration 11, inertia 836020313.7704592
Iteration 12, inertia 834686809.4978566
Iteration 13, inertia 833427055.1969389
Iteration 14, inertia 832313144.8285223
Iteration 15, inertia 831221992.431438
Iteration 16, inertia 830141224.5351139
Iteration 17, inertia 829256809.4385332
Iteration 18, inertia 828550858.9799689
Iteration 19, inertia 827936254.7423097
Iteration 20, inertia 827440841.2807791
Iteration 21, inertia 827074519.2310418
Iteration 22, inertia 826778170.35055
Iteration 23, inertia 826453931.2834365
Iteration 24, inertia 8262173

Iteration 20, inertia 803348953.6428561
Iteration 21, inertia 802743504.4968879
Iteration 22, inertia 802208155.7586437
Iteration 23, inertia 801731941.8303521
Iteration 24, inertia 801309511.3359728
Iteration 25, inertia 800882884.240834
Iteration 26, inertia 800485931.5700032
Iteration 27, inertia 800042564.3933761
Iteration 28, inertia 799507882.0472088
Iteration 29, inertia 798841990.0629505
Iteration 30, inertia 798229278.6842678
Iteration 31, inertia 797616310.3239317
Iteration 32, inertia 796978801.5882607
Iteration 33, inertia 796370181.3519707
Iteration 34, inertia 795860410.214508
Iteration 35, inertia 795475629.6502682
Iteration 36, inertia 795177491.9270439
Iteration 37, inertia 794945109.1798244
Iteration 38, inertia 794740537.39133
Iteration 39, inertia 794547033.2793908
Iteration 40, inertia 794407107.2167655
Iteration 41, inertia 794298404.2485048
Iteration 42, inertia 794208406.6821035
Iteration 43, inertia 794152480.9774455
Converged at iteration 43: center shift 0.15

Iteration 50, inertia 794947309.5443317
Iteration 51, inertia 794757729.8752433
Iteration 52, inertia 794598476.7472414
Iteration 53, inertia 794485143.8369912
Iteration 54, inertia 794351060.0289012
Iteration 55, inertia 794216100.4098958
Iteration 56, inertia 794050338.7508217
Iteration 57, inertia 793939209.5400443
Iteration 58, inertia 793848095.6511741
Iteration 59, inertia 793766219.2568653
Iteration 60, inertia 793669254.420889
Iteration 61, inertia 793571156.2396848
Iteration 62, inertia 793483892.7828504
Iteration 63, inertia 793388432.8754201
Iteration 64, inertia 793309768.0757241
Iteration 65, inertia 793238077.8938369
Iteration 66, inertia 793160765.4862556
Iteration 67, inertia 793104275.5965675
Converged at iteration 67: center shift 0.15584866776079098 within tolerance 0.19013458177123363
Initialization complete
Iteration 0, inertia 992341684.4247881
Iteration 1, inertia 856682332.4055996
Iteration 2, inertia 833789108.1785917
Iteration 3, inertia 827069862.2522519
Iter

In [19]:
tsne_clusters_birch = birch.fit_predict(tsne)

In [22]:
# initialise a list to the right size for performance reasons
ids = [None]*count

i=0
with engine.connect() as connection:
    result = connection.execute("SELECT jobid FROM jobs")
    for row in result:
        ids[i] = row[0]
        i+=1

In [23]:
%%time
d2v_clusters = [None] * count
for i in range(count):
    entry = {'jobid':ids[i],'pca':int(clusters[i]),'tsne':int(tsne_clusters[i])}
    d2v_clusters[i] = entry

Wall time: 5.43 s


In [24]:
# connect to the database
engine2 = alch.create_engine('sqlite:///Doc2Vec20t2d.db', echo=False)

In [25]:
metadata = alch.MetaData()
jobs = alch.Table('jobs', metadata,
              alch.Column('jobid', alch.Integer, primary_key=True),
              alch.Column('pca', alch.Integer),
              alch.Column('tsne', alch.Integer)
             )
metadata.create_all(engine2)

In [26]:
conn = engine2.connect()

In [27]:
%%time
conn.execute(jobs.insert(), d2v_clusters)

Wall time: 16.4 s


<sqlalchemy.engine.result.ResultProxy at 0x1e917bbbc50>

In [28]:
# connect to the database
engine2 = alch.create_engine('sqlite:///Doc2Vec20t2dBirch.db', echo=False)

In [29]:
metadata = alch.MetaData()
jobs = alch.Table('jobs', metadata,
              alch.Column('jobid', alch.Integer, primary_key=True),
              alch.Column('pca', alch.Integer),
              alch.Column('tsne', alch.Integer)
             )
metadata.create_all(engine2)

In [30]:
conn = engine2.connect()

In [31]:
%%time
d2v_clusters_birch = [None] * count
for i in range(count):
    entry = {'jobid':ids[i],'pca':int(clusters_birch[i]),'tsne':int(tsne_clusters_birch[i])}
    d2v_clusters_birch[i] = entry

Wall time: 2.63 s


In [32]:
%%time
conn.execute(jobs.insert(), d2v_clusters_birch)

Wall time: 12.6 s


<sqlalchemy.engine.result.ResultProxy at 0x1e91ca7e978>