In [19]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim

# --- Shared NLP helpers reused by the cells below ---------------------------
tokenizer = RegexpTokenizer(r'\w+')        # tokens are runs of word characters
en_stop = set(stopwords.words('english'))  # English stop-word list as a set
p_stemmer = PorterStemmer()                # Porter stemmer instance

# --- Sample documents -------------------------------------------------------
# The list is built first and then unpacked so that both the individual
# doc_a..doc_e names and the combined doc_set list are available.
doc_set = [
    "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.",
    "My mother spends a lot of time driving my brother around to baseball practice.",
    "Some health experts suggest that driving may cause increased tension and blood pressure.",
    "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.",
    "Health professionals say that brocolli is good for your health.",
]
doc_a, doc_b, doc_c, doc_d, doc_e = doc_set

In [20]:
from pdftojsonl import process_pdfs

import json

# NOTE(review): these are absolute, machine-specific locations; they are named
# here so they can be changed in one place when running elsewhere.
CANDIDATE_PDF_DIR = 'c:\\tools\\code\\ResumeAnalytics\\Candidate Resumes'
TARGET_PDF_DIR = 'c:\\tools\\code\\ResumeAnalytics\\Target Resumes'
CANDIDATE_JSONL = 'candidate.jsonl'
TARGET_JSONL = 'target.jsonl'

# Convert each PDF folder into a JSON Lines file (one JSON object per line).
process_pdfs(CANDIDATE_PDF_DIR, CANDIDATE_JSONL)
process_pdfs(TARGET_PDF_DIR, TARGET_JSONL)


def _read_jsonl(path):
    """Return the list of objects stored in the JSON Lines file at ``path``.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of being passed to ``json.loads``, which would raise.
    """
    with open(path, 'r') as handle:
        return [json.loads(line) for line in handle if line.strip()]


candidate = _read_jsonl(CANDIDATE_JSONL)
target = _read_jsonl(TARGET_JSONL)

# One entry per target resume; this replaces any earlier doc_set.
doc_set = [t['content'] for t in target]

# All target resumes combined into a single document. They are joined with a
# newline so the last word of one resume and the first word of the next stay
# separate tokens (plain concatenation fused them into one bogus word).
mashed_target = '\n'.join(doc_set)
target = [{'name': 'target', 'content': mashed_target}]

# Raw text of every candidate resume, in file order.
can_set = [c['content'] for c in candidate]


In [21]:

NUM_TOPICS = 20  # number of latent topics requested from the LDA model
NUM_PASSES = 20  # number of passes over the corpus during training
LDA_SEED = 0     # fixed seed so a fresh Restart & Run All yields the same topics


def preprocess(text):
    """Convert a raw document string into a list of stemmed tokens.

    The text is lower-cased, split with the notebook-level ``tokenizer``,
    filtered against the ``en_stop`` stop-word set, and each remaining token
    is stemmed with ``p_stemmer``.
    """
    tokens = tokenizer.tokenize(text.lower())
    return [p_stemmer.stem(token) for token in tokens if token not in en_stop]


# tokenized documents, one token list per entry of doc_set
texts = [preprocess(doc) for doc in doc_set]

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix (bag-of-words)
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model; random_state pins the otherwise random initialisation
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    random_state=LDA_SEED,
)

In [22]:
# Print 4 words for each of the 20 topics of the trained model.
print(
    ldamodel.print_topics(
        num_topics=20,
        num_words=4,
    )
)

[(0, '0.001*develop + 0.001*comput + 0.001*research + 0.001*use'), (1, '0.001*experi + 0.001*9 + 0.001*stanford + 0.001*analysi'), (2, '0.001*develop + 0.001*univers + 0.001*work + 0.001*experi'), (3, '0.001*develop + 0.001*project + 0.001*research + 0.001*experi'), (4, '0.001*project + 0.001*librari + 0.001*develop + 0.001*work'), (5, '0.023*project + 0.013*develop + 0.012*app + 0.012*librari'), (6, '0.001*project + 0.001*develop + 0.001*app + 0.001*work'), (7, '0.001*project + 0.001*app + 0.001*work + 0.001*develop'), (8, '0.001*develop + 0.001*comput + 0.001*b + 0.001*mathemat'), (9, '0.016*experi + 0.016*9 + 0.016*stanford + 0.013*va'), (10, '0.001*research + 0.001*comput + 0.001*develop + 0.001*applic'), (11, '0.001*stanford + 0.001*va + 0.001*06 + 0.001*work'), (12, '0.001*project + 0.001*develop + 0.001*app + 0.001*end'), (13, '0.017*univers + 0.017*st + 0.017*kenyon + 0.017*loui'), (14, '0.020*research + 0.016*develop + 0.013*web + 0.013*comput'), (15, '0.001*develop + 0.001*pr

In [32]:
def _infer_topics(text):
    """Return the LDA topic distribution of a raw document string.

    Applies the same steps used when the model was trained (lower-case,
    tokenize, drop ``en_stop`` words, stem) and passes the resulting
    bag-of-words to ``ldamodel.get_document_topics``.
    """
    tokens = tokenizer.tokenize(text.lower())
    stems = [p_stemmer.stem(token) for token in tokens if token not in en_stop]
    return ldamodel.get_document_topics(dictionary.doc2bow(stems))


# Topic distribution of each document the model was trained on.
for doc in doc_set:
    print(_infer_topics(doc))

print(">>>>Candidates:<<<<\n\n")

for cand in candidate:
    # Run inference once so the printed distribution is exactly the one
    # that was summed (previously it was computed twice per candidate).
    topics = _infer_topics(cand['content'])

    # NOTE(review): 'distance' is the sum of the returned topic probabilities,
    # so it is close to 1 for every candidate and does not rank candidates
    # against the target resumes — confirm the intended metric.
    cand['distance'] = sum(prob for _, prob in topics)

    print(cand['name'] + ": " + str(cand['distance']))
    print(topics)



[(9, 0.99666666666295112)]
[(14, 0.99654545454111176)]
[(5, 0.99872654155047613)]
[(13, 0.99543269229798748)]
>>>>Candidates:<<<<


Aberra Aimen 7494924 Resume.txt: 0.996172248804
[(5, 0.37504208109239018), (9, 0.17951671068742173), (13, 0.20064036259802667), (14, 0.24097309442598883)]
Agnihotri Vasant 4883545 Resume.txt: 0.998464491363
[(5, 0.50471367471945328), (9, 0.13683959118554778), (13, 0.052641003600650632), (14, 0.30427022185711261)]
Armstrong Mike 6086999 Resume.txt: 0.995789473684
[(5, 0.5059493437786472), (9, 0.22216904009075572), (13, 0.07893224467916117), (14, 0.18873884513564596)]
Bederu Mariam 6728650 Resume.txt: 0.9961352657
[(5, 0.45228752792708787), (9, 0.14570385962745505), (13, 0.085520162511582704), (14, 0.3126237156343572)]
Chen John 5261366 Resume.txt: 0.99794344473
[(5, 0.58230993939984288), (9, 0.21253273694014654), (13, 0.043644951754066727), (14, 0.15945581663602063)]
Coleman Brad 621556 Resume.txt: 0.996850393701
[(5, 0.48772696184732395), (9, 0.11211643608