In [1]:
import os
import pandas as pd
import timeit
import sqlalchemy as alch
import numpy as np
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
from bokeh.plotting import figure, output_file, show, output_notebook
%matplotlib inline

from nltk.corpus import stopwords
import requests


import pickle

In [2]:
# connect to the database
engine = alch.create_engine('sqlite:///jobs_preprocessed.db', echo=False)

In [3]:
lda = LatentDirichletAllocation(n_components=24,learning_decay=0.7)

In [4]:
# Retrieve 250k entries from the database
entries = 250000
# initialise a list to the right size for performance reasons
descriptions = [None]*entries

i=0
with engine.connect() as connection:
    result = connection.execute("SELECT jobDescription FROM jobs LIMIT 250000")
    for row in result:
        descriptions[i] = row[0]
        i+=1

In [5]:
vectorizer = CountVectorizer(analyzer='word',       
                             min_df=10,                        
                             stop_words='english',          
                             lowercase=True,                   
                             token_pattern='[a-zA-Z0-9]{3,}'
                            )

data_vectorized = vectorizer.fit_transform(descriptions)

In [6]:
%%time
lda.fit(data_vectorized)

Wall time: 34min 40s


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=24, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [7]:
# pickle the variable
pickle.dump( lda, open( "best_lda_model_250k_24t.p", "wb" ) )

In [8]:
# pickle the variable
lda = pickle.load( open( "best_lda_model_250k_24t.p", "rb" ) )

In [9]:
# connect to the database
engine_new = alch.create_engine('sqlite:///jobs_LDA_24.db', echo=False)

In [10]:
metadata = alch.MetaData()
jobs = alch.Table('jobs', metadata,
              alch.Column('jobid', alch.Integer, primary_key=True),
              alch.Column('jobTopic', alch.Integer)
             )
metadata.create_all(engine_new)

In [11]:
conn = engine_new.connect()

In [12]:
%%time
with engine.connect() as connection:
    result = connection.execute("SELECT COUNT(*) FROM jobs")
    for row in result:
        count = row[0]
    
# initialise a list to the right size for performance reasons
descriptions = [None]*count

i=0
with engine.connect() as connection:
    result = connection.execute("SELECT jobDescription FROM jobs")
    for row in result:
        descriptions[i] = row[0]
        i+=1

Wall time: 41.9 s


In [13]:
%%time
#data_vectorized = vectorizer.fit_transform(descriptions[:250000])

Wall time: 0 ns


In [14]:
%%time
data_vectorized = vectorizer.transform(descriptions)

Wall time: 6min 45s


In [15]:
%%time
transformed = lda.transform(data_vectorized)

Wall time: 32min 18s


In [16]:
print(transformed[0])

[3.20512821e-04 3.52724151e-01 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.16405816e-01 3.24139264e-01
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04]


In [17]:
print(transformed[0])

[3.20512821e-04 3.52724151e-01 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.16405816e-01 3.24139264e-01
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04
 3.20512821e-04 3.20512821e-04 3.20512821e-04 3.20512821e-04]


In [18]:
indexes = [None]*count
i=0
for element in transformed:
    index = np.argmax(element)
    indexes[i] = index
    i+=1

In [19]:
print(indexes[152:162])

[22, 7, 18, 7, 15, 15, 16, 0, 5, 3]


In [20]:
%%time
i=0
f=0
entries = [None]*count
with engine.connect() as con:

    rs = con.execute('SELECT jobid FROM jobs')

    for row in rs:
        
        try:      
            entry = {'jobid': row[0],'jobTopic': int(indexes[i])}
            entries[i] = entry
        except:
            f +=1

        i+=1        
        if i %100000 == 0:
            print(i,"entries parsed")

100000 entries parsed
200000 entries parsed
300000 entries parsed
400000 entries parsed
500000 entries parsed
600000 entries parsed
700000 entries parsed
800000 entries parsed
900000 entries parsed
1000000 entries parsed
1100000 entries parsed
1200000 entries parsed
1300000 entries parsed
1400000 entries parsed
1500000 entries parsed
1600000 entries parsed
1700000 entries parsed
1800000 entries parsed
1900000 entries parsed
2000000 entries parsed
2100000 entries parsed
2200000 entries parsed
2300000 entries parsed
2400000 entries parsed
2500000 entries parsed
2600000 entries parsed
2700000 entries parsed
2800000 entries parsed
2900000 entries parsed
3000000 entries parsed
3100000 entries parsed
3200000 entries parsed
Wall time: 52.4 s


In [21]:
%%time
conn.execute(jobs.insert(), entries)
                  

Wall time: 15 s


<sqlalchemy.engine.result.ResultProxy at 0x1b480c96e80>

In [22]:
print(f)

0


In [23]:
print(f)

0
