In [89]:
import gensim
import nltk
import numpy as np
import re
import pandas as pd
nltk.download('stopwords')

from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords
from docx import Document
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to /home/nvidia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

def clean_text(text):
    return_text = _removeNonAscii(text)
    return_text = make_lower_case(return_text)
    return_text = remove_stop_words(return_text)
    return_text = remove_html(return_text)
    return_text = remove_punctuation(return_text)
    return return_text

In [30]:
filename = './input/cover_letter.docx'
doc = Document(filename)
full_cover_letter = ""
for p in doc.paragraphs:
    full_cover_letter += p.text

text_resume = str(full_cover_letter) # Summarize the text with ratio 0.1 (10% of the total words.)
summarize(text_resume, ratio=0.3)

'I want to work for <company> because I believe that your company is the perfect intersection between my passion for innovative media and brand communication as well as my skills gained from diverse copywriting experiences.\nAs the Event Planning Intern at the Greenwich Village Chelsea Chamber of Commerce (GVCCC), I gained significant experience in copywriting for event marketing materials.\nIn this position, I drafted and edited several rounds of event copy, online event registration pages, and email invitations to ensure brand consistency through writing style, tone, and voice.\nThis experience taught me about the importance of achieving a solid understanding of each organization’s goals to create a marketing message built on a strong synthesis of all brands.\nI believe that this skill will transfer to this position because of Moss Adams’s diverse set of relationships and messaging when working with various clients.\nSince we regularly work on more than one event at a time, I possess

In [33]:
job_desc = """Rowman & Littlefield is seeking an entry-level Editorial Assistant for its Lanham, Maryland, office. The Editorial Assistant will support senior editors in its busy editorial production department, assisting with many aspects of production of academic and trade titles, as well as some administrative functions. Ideal candidate has a background in the humanities, attention to detail, and a desire to earn experience in the book publishing industry.

Requirements
• BA or BS, preferably in English or journalism (minor or area of concentration in history, religion, philosophy, political science, or any other discipline in which RLPG publishes).
• Knowledge of the Chicago Manual of Style (familiarity with APA and other style guides a bonus)
• Knowledge of MS Office suite (Word, Excel, Access) and Windows
• Excellent communication and time-management skills"""

summarize(job_desc, ratio=0.5)

'Ideal candidate has a background in the humanities, attention to detail, and a desire to earn experience in the book publishing industry.\n• BA or BS, preferably in English or journalism (minor or area of concentration in history, religion, philosophy, political science, or any other discipline in which RLPG publishes).\n• Knowledge of MS Office suite (Word, Excel, Access) and Windows\n• Excellent communication and time-management skills'

## Method 1

In [36]:
def similarity(document_array, query_array):
    file_docs = document_array
    file2_docs = query_array
    avg_sims = []
    
    gen_docs = [[w.lower() for w in word_tokenize(text)] 
                for text in file_docs]

    dictionary = gensim.corpora.Dictionary(gen_docs)
    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    sims = gensim.similarities.Similarity('sim_checkpoints/',tf_idf[corpus],
                                        num_features=len(dictionary))
            
    for line in file2_docs:
        query_doc = [w.lower() for w in word_tokenize(line)]
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        print('Comparing Result for {}: {}'.format(line, sims[query_doc_tf_idf]))
        sum_of_sims =(np.sum(sims[query_doc_tf_idf], dtype=np.float32))
        avg = sum_of_sims / len(file_docs)
        print(f'avg: {sum_of_sims / len(file_docs)}')
        avg_sims.append(avg)  
    total_avg = np.sum(avg_sims, dtype=np.float)
    print(total_avg)
    percentage_of_similarity = round(float(total_avg) * 100)
    if percentage_of_similarity >= 100:
        percentage_of_similarity = 100


In [39]:
job_desc_cleaned = [clean_text(sentence) for sentence in job_desc.split('.')]
clean_cover_letter = [clean_text(sentence.text) for sentence in doc.paragraphs]
similarity(job_desc_cleaned, clean_cover_letter)
# print(job_desc_cleaned)
# clean_cover_letter

Comparing Result for cassandra chia: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for : [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for 728 bowery st iowa city ia 52240 202 718 8644: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for cassqchia gmail com: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for 19 october 2020: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for : [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for moss adams: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for 999 third avenue suite 2800: [0.         0.         0.         0.         0.19871596]
avg: 0.03974319100379944
Comparing Result for seattle wa 98104 4057: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for : [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for dear hiring manager: [0. 0. 0. 0. 0.]
avg: 0.0
Comparing Result for writing express interest position currently completing last semester university iowa english creative writing major literary publishing track graduation date december 2020 available work january 4 2021 want work be

## Method 2

In [68]:
corpus = ["I'd like an apple", "An apple a day keeps the doctor away", "Never compare an apple to an orange", "I prefer scikit-learn to Orange", "The scikit-learn docs are Orange and Blue"]
# corpus = clean_cover_letter
# corpus.append(clean_text(job_desc))
vect = TfidfVectorizer(min_df=1, stop_words="english")                        
tfidf = vect.fit_transform(corpus)                                                                      
pairwise_similarity = tfidf * tfidf.T 
pairwise_similarity.toarray()

array([[1.        , 0.17668795, 0.27056873, 0.        , 0.        ],
       [0.17668795, 1.        , 0.15439436, 0.        , 0.        ],
       [0.27056873, 0.15439436, 1.        , 0.19635649, 0.16815247],
       [0.        , 0.        , 0.19635649, 1.        , 0.54499756],
       [0.        , 0.        , 0.16815247, 0.54499756, 1.        ]])

In [70]:
arr = pairwise_similarity.toarray() 
np.fill_diagonal(arr, np.nan)
input_doc = corpus[-1]
input_idx = corpus.index(input_doc)
print(input_idx)

result_idx = np.nanargmax(arr[input_idx]) 
print(corpus[result_idx])
print(arr[input_idx])

4
I prefer scikit-learn to Orange
[0.         0.         0.16815247 0.54499756        nan]


## Inference

In [71]:
# filename = './input/resume_002.docx'
# doc = Document(filename)
# full_cover_letter = ""
# for p in doc.paragraphs:
#     full_cover_letter += p.text

# for table in doc.tables:
#     for row in table.rows:
#         for cell in row.cells:
#             full_cover_letter += cell.text

# text_resume = str(full_cover_letter) # Summarize the text with ratio 0.1 (10% of the total words.)
# summarize(text_resume, ratio=0.3)

In [82]:
f = open("./input/resume.txt", "r")
exp_array = []
job_id_array = []
job_id = 0 
for x in f:
  if ('<sep>' in x): 
      print("")
      job_id += 1
  elif ('<job>' in x): print("Job title: {}".format(x.strip().replace('<job>','')))
  elif ('<com>' in x): print("Company: {}".format(x.strip().replace('<com>','')))
  else: 
      print(x.strip())
      exp_array.append(clean_text(x.strip()))
      job_id_array.append(job_id)

Job title: Software Engineer - 09/2015 to 05/2019
Company: Luna Software, New York
Investigation, design, and implement scalable applications for data identification, analysis, retrieval, and indexing.
Software design and development while remaining concentrate on client needs.
Cooperate diligently with other IT team members to plan, design, and develop smart solutions.
Estimate interface between hardware and software.
Interface with business analysts, developers, and technical support to determine optimal specifications.

Job title: Junior Software Engineer - 09/2014 to 09/2015
Company: AdsPro Software, New York
Consulted regularly with customers on project status, proposals and technical issues.
Transformed existing software to correct errors, upgrade interfaces, and improve efficiency.
Cooperate diligently with other IT team members to plan, design, and develop smart solutions.


In [83]:
f = open("./input/job_desc.txt", "r")
job_desc_string = ""
for x in f:
    job_desc_string += x.strip()

job_desc_string = clean_text(job_desc_string)

In [84]:
corpus = exp_array
corpus.append(job_desc_string)

vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(corpus)
pairwise_similarity = tfidf * tfidf.T 
pairwise_similarity.toarray()

array([[1.        , 0.04842342, 0.04820191, 0.        , 0.        ,
        0.        , 0.        , 0.04820191, 0.05942944],
       [0.04842342, 1.        , 0.06100803, 0.10168122, 0.        ,
        0.        , 0.06203475, 0.06100803, 0.12702242],
       [0.04820191, 0.06100803, 1.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.07524141],
       [0.        , 0.10168122, 0.        , 1.        , 0.14276461,
        0.        , 0.08194772, 0.        , 0.04807007],
       [0.        , 0.        , 0.        , 0.14276461, 1.        ,
        0.06958921, 0.        , 0.        , 0.02242913],
       [0.        , 0.        , 0.        , 0.        , 0.06958921,
        1.        , 0.        , 0.        , 0.00992565],
       [0.        , 0.06203475, 0.        , 0.08194772, 0.        ,
        0.        , 1.        , 0.        , 0.0293271 ],
       [0.04820191, 0.06100803, 1.        , 0.        , 0.        ,
        0.        , 0.        , 1.        , 0.07524141],


In [85]:
arr = pairwise_similarity.toarray() 
np.fill_diagonal(arr, np.nan)
input_doc = corpus[-1]
input_idx = corpus.index(input_doc)
print(input_idx)

result_idx = np.nanargmax(arr[input_idx]) 
print(corpus[result_idx])
print(arr[input_idx])

8
software design development remaining concentrate client needs
[0.05942944 0.12702242 0.07524141 0.04807007 0.02242913 0.00992565
 0.0293271  0.07524141        nan]


In [86]:
arr[input_idx][0:-1]

array([0.05942944, 0.12702242, 0.07524141, 0.04807007, 0.02242913,
       0.00992565, 0.0293271 , 0.07524141])

In [93]:
data = {'exp': exp_array[0:-1], 'job_id': job_id_array, 'sim': arr[input_idx][0:-1]}

In [95]:
df = pd.DataFrame(data) 
df.head()

Unnamed: 0,exp,job_id,sim
0,investigation design implement scalable applic...,0,0.059429
1,software design development remaining concentr...,0,0.127022
2,cooperate diligently team members plan design ...,0,0.075241
3,estimate interface hardware software,0,0.04807
4,interface business analysts developers technic...,0,0.022429


In [97]:
df.sort_values(by=['job_id', 'sim'], ascending=False)

Unnamed: 0,exp,job_id,sim
7,cooperate diligently team members plan design ...,1,0.075241
6,transformed existing software correct errors u...,1,0.029327
5,consulted regularly customers project status p...,1,0.009926
1,software design development remaining concentr...,0,0.127022
2,cooperate diligently team members plan design ...,0,0.075241
0,investigation design implement scalable applic...,0,0.059429
3,estimate interface hardware software,0,0.04807
4,interface business analysts developers technic...,0,0.022429


In [None]:
# TODO: write the new order to file
# TODO: allow users to use <ignore> tag for sentences that are more explanatory for whole job
# TODO: question if this is really getting to the heart of the problem. Simple ML may not be able to tackle the harder issue of ensuring this is all correct. Maybe instead, if we have 3 different resumes, we are to categorize them according to job description and somehow decide which ones to send off? Either way, a simple prototype like this may not be the best method of going forward for now.