#### Pending
1. Processing docx file<br>
   
#### Configuration file
1. Directory where resumes are stored<br>
2. The extra words file<br>
3. Temporary resume file<br>
4. Comprehensive dictionary file<br>
5. Directory where JDs are stored<br>

### Imports

In [None]:
# Brown dictionary
from nltk.corpus import brown
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# For punctuation marks
import string
# For reading docx file
try:
    from xml.etree.cElementTree import XML
except ImportError:
    from xml.etree.ElementTree import XML
import zipfile
# For reading pdf file
import PyPDF2
# For regular expression
import re
from docx import Document
# For similarity in document comparison
import gensim
# For reading directory
import os

### Reading the config file name in main directory

In [None]:
# Directory that is searched for the configuration file (relative to the
# current working directory).
working_dir = './Resume/'
# Names of all regular files in working_dir whose name contains 'config_'.
# Sorted so the selection below is reproducible when several files match,
# because os.listdir returns entries in arbitrary order.
config_file = sorted(
    file for file in os.listdir(working_dir)
    if os.path.isfile(os.path.join(working_dir, file)) and 'config_' in file
)
# Fail with an explicit message instead of a bare IndexError
if not config_file:
    raise FileNotFoundError("No file containing 'config_' found in " + working_dir)
# The first match is the configuration file that gets read
config_file_path = os.path.join(working_dir, config_file[0])

### Read the configuration file, which contains:
1. Resume Directory
2. Path of Extra dictionary
3. Path of temporary resume file
4. Path of my clean dictionary
5. JD Directory

In [None]:
# Read the configuration file. A recognised line starts with a seven-character
# key (including the '='); the rest of the line, without trailing newline
# characters, is the value:
#   resdir= -> resume_dir            (resume directory)
#   exdict= -> extra_dict_path       (extra dictionary)
#   tempre= -> temp_resume_txt_path  (temporary resume text file)
#   mydict= -> my_dict_path          (clean dictionary)
#   jd_dir= -> jd_dir                (job description directory)
# Lines with any other prefix are ignored. A key that never appears leaves its
# variable unassigned.
# The `with` block guarantees the file handle is closed after reading.
with open(file=config_file_path, mode='r') as config_file:
    for line in config_file:
        # Everything after the key, with the line ending removed
        value = line[7:].rstrip('\r\n')

        # The keys are mutually exclusive, so a single if/elif chain suffices
        if line.startswith('resdir='):
            resume_dir = value
        elif line.startswith('exdict='):
            extra_dict_path = value
        elif line.startswith('tempre='):
            temp_resume_txt_path = value
        elif line.startswith('mydict='):
            my_dict_path = value
        elif line.startswith('jd_dir='):
            jd_dir = value

# Dictionaries

### Read the clean dictionary I created

In [None]:
# Load the clean dictionary as a set of whitespace-separated words.
# my_dict_path comes from the configuration file. An http(s) URL is downloaded
# with the standard library (the earlier code called urllib3 without importing
# it and was missing a comma in the request call); any other value is treated
# as the path of a local UTF-8 text file.
import urllib.parse
import urllib.request

if urllib.parse.urlparse(my_dict_path).scheme in ('http', 'https'):
    with urllib.request.urlopen(my_dict_path) as response:
        my_dict = set(response.read().decode('utf-8').split())
else:
    with open(file=my_dict_path, mode='r', encoding='utf-8') as my_dict_file:
        my_dict = set(my_dict_file.read().split())

### Read extra words files

In [None]:
# Load the extra words file: read it in one go, then turn every
# whitespace-separated token into a member of the set.
with open(extra_dict_path, 'r') as extra_dict_file:
    extra_words_text = extra_dict_file.read()
extra_dict = set(extra_words_text.split())

### Read the inbuilt brown dictionary

In [None]:
# Collect the words returned by the Brown corpus reader into a set so that
# duplicates collapse into a single entry.
brown_words = brown.words()
brown_dict = set(brown_words)

### Union of dictionaries

In [None]:
# A word counts as correctly spelt when it appears in at least one of the
# three dictionaries, so merge them into a single set.
final_good_word_list = set().union(extra_dict, brown_dict, my_dict)

# Creating Resume and JD path list
##### Resumes have to start with resume_ and Job Descriptions have to start with JD_

In [None]:
# Full paths of the resume files: regular files in resume_dir whose name
# contains 'resume_'. os.path.join is used for the result as well as for the
# isfile check, so a directory configured without a trailing separator still
# yields valid paths. Names are sorted because os.listdir order is arbitrary;
# sorting keeps the order of this list stable between runs.
resume_path_list = [
    os.path.join(resume_dir, file)
    for file in sorted(os.listdir(resume_dir))
    if os.path.isfile(os.path.join(resume_dir, file)) and 'resume_' in file
]
# Full paths of the job description files: same rules, name contains 'JD_'.
jd_path_list = [
    os.path.join(jd_dir, file)
    for file in sorted(os.listdir(jd_dir))
    if os.path.isfile(os.path.join(jd_dir, file)) and 'JD_' in file
]

# Spelling Mistakes

In [None]:
# Spell-check every resume in resume_path_list: extract its text, clean it,
# drop English stop words and keep each remaining token that is not in
# final_good_word_list. The result for the n-th resume (1-based, in list
# order) is stored in spelling_mistakes under the key str(n).
spelling_mistakes = {}
counter = 1
# The stop-word set does not depend on the resume, so build it once
stop_words = set(stopwords.words("english"))
# Loop through all the resume files
for path in resume_path_list:
    # Extension without the dot, lower-cased so that '.PDF' is handled like '.pdf'
    resume_type = os.path.splitext(path)[1][1:].lower()

    # Logic to read the file based on the extension
    if resume_type == "txt":
        with open(file=path, mode='r') as resume_file:
            resume_words = resume_file.read()

    elif resume_type == "pdf":
        # Page text is written to the temporary text file and read back from it.
        # Both steps use UTF-8 so any extracted character can be stored.
        with open(file=path, mode='rb') as pdf_resume, \
                open(file=temp_resume_txt_path, mode='w', encoding='utf-8') as temp_txt_resume:
            read_pdf = PyPDF2.PdfFileReader(pdf_resume)
            total_pages = read_pdf.getNumPages()

            for page_number in range(total_pages):
                page = read_pdf.getPage(page_number)
                page_content = page.extractText()
                temp_txt_resume.write(page_content)

        with open(file=temp_resume_txt_path, mode='r', encoding='utf-8') as temp_txt_resume:
            resume_words = temp_txt_resume.read()

    else:
        # Any other format is not supported here. Use empty text so the text of
        # the previous resume is never reused, and still record an entry so the
        # numbering stays aligned with resume_path_list.
        print("Unsupported resume format, treated as empty:", path)
        resume_words = ''

    # Cleaning the data
    # Removing numbers
    resume_words = re.sub(r'\d+', '', resume_words)
    # Replacing punctuation with a space
    resume_words = re.sub(r'[^a-zA-Z0-9\s]', ' ', resume_words)
    # Collapsing runs of new lines and tabs into ONE SPACE. Deleting them
    # outright would glue the last word of a line to the first word of the next.
    resume_words = re.sub(r'[\n\t]+', ' ', resume_words)
    # Lower-case whole words that consist of one uppercase letter followed by
    # lowercase letters: "Test" becomes "test", "TEST" is left alone. The word
    # boundaries keep fragments of mixed-case words untouched.
    resume_words = re.sub(r'\b[A-Z][a-z]+\b', lambda match: match.group().lower(), resume_words)

    # Tokenize and remove stop words
    tokenized_words = [w for w in word_tokenize(resume_words) if w not in stop_words]

    # Every token missing from the combined dictionary counts as a spelling mistake
    wrong_spelling_resume = [word for word in tokenized_words if word not in final_good_word_list]

    # Eliminating duplicates while keeping the order of first appearance
    unique_wrong_spelling_resume = list(dict.fromkeys(wrong_spelling_resume))

    # Writing spelling mistakes in the result dictionary
    spelling_mistakes[str(counter)] = unique_wrong_spelling_resume
    # Increase the counter for next resume
    counter = counter + 1

### Print the spelling error counts

In [None]:
# spelling_mistakes is keyed by the strings '1', '2', ..., so walk the resume
# numbers directly and print the length of the entry stored for each one.
total_resumes = len(spelling_mistakes)
for resume_number in range(1, total_resumes + 1):
    mistakes = spelling_mistakes[str(resume_number)]
    print("Number of spelling mistake in resume", resume_number, "=", len(mistakes))

### Print Spelling Mistakes

In [None]:
# Print the entry stored for each resume number, framed above and below by a
# line of 100 asterisks.
separator = "*" *100
for resume_number in range(1, len(spelling_mistakes) + 1):
    print("Spelling mistakes in resume", resume_number, "are as follows")
    print(separator)
    print(spelling_mistakes[str(resume_number)])
    print(separator, "\n")

# Similarity Matching - Document Comparison

### Read all Job description files

In [None]:
# Read each job description file completely; jd_combine ends up holding one
# string per entry of jd_path_list, in the same order.
jd_combine = []
for jd_path in jd_path_list:
    with open(jd_path, 'r') as jd_file:
        jd_combine.append(jd_file.read())

### For Job Description docs
1. Tokenize
2. Create dictionary
3. Corpus
4. tf_idf model
5. Similarity object

In [None]:
# Step 1 - Tokenize: one list of lower-cased tokens per job description.
jd_docs = [list(map(str.lower, word_tokenize(text))) for text in jd_combine]

# Step 2 - Dictionary: built from the tokenized documents. It maps every
# unique word to a number.
jd_dictionary = gensim.corpora.Dictionary(jd_docs)

# Step 3 - Corpus: a list with one bag of words (bow) per document. A bow
# representation lists how many times each word occurs in the document.
jd_corpus = list(map(jd_dictionary.doc2bow, jd_docs))

# Step 4 - tf-idf model created from the corpus.
jd_tf_idf = gensim.models.TfidfModel(jd_corpus)

# Step 5 - Similarity measure object in tf-idf space. Term frequency is how
# often a word shows up in a document; inverse document frequency scales that
# value by how rare the word is in the corpus.
jd_sims = gensim.similarities.Similarity(
    working_dir,
    jd_tf_idf[jd_corpus],
    num_features=len(jd_dictionary),
)

### Loop through the resumes

In [None]:
# Compare every resume in resume_path_list with the job descriptions: extract
# its text, convert it to a tf-idf vector using the JD dictionary and model,
# and query jd_sims with it. The result for the n-th resume (1-based, in list
# order) is stored in jd_resume_similarity under the key str(n).
jd_resume_similarity = {}
counter = 1
# Loop through all the resume files
for path in resume_path_list:
    # Extension without the dot, lower-cased so that '.PDF' is handled like '.pdf'
    resume_type = os.path.splitext(path)[1][1:].lower()

    # Logic to read the file based on the extension
    if resume_type == "txt":
        with open(file=path, mode='r') as resume_file:
            resume_words = resume_file.read()

    elif resume_type == "pdf":
        # Page text is written to the temporary text file and read back from it.
        # Both steps use UTF-8 so any extracted character can be stored.
        with open(file=path, mode='rb') as pdf_resume, \
                open(file=temp_resume_txt_path, mode='w', encoding='utf-8') as temp_txt_resume:
            read_pdf = PyPDF2.PdfFileReader(pdf_resume)
            total_pages = read_pdf.getNumPages()

            for page_number in range(total_pages):
                page = read_pdf.getPage(page_number)
                page_content = page.extractText()
                temp_txt_resume.write(page_content)

        with open(file=temp_resume_txt_path, mode='r', encoding='utf-8') as temp_txt_resume:
            resume_words = temp_txt_resume.read()

    else:
        # Any other format is not supported here. Use empty text so the text of
        # the previous resume is never reused, and still record an entry so the
        # numbering stays aligned with resume_path_list.
        print("Unsupported resume format, treated as empty:", path)
        resume_words = ''

    # Tokenize Resume
    resume_doc = [w.lower() for w in word_tokenize(resume_words)]
    # Create BOW from the JD dictionary
    resume_doc_bow = jd_dictionary.doc2bow(resume_doc)
    # Create tf-idf
    resume_doc_tf_idf = jd_tf_idf[resume_doc_bow]
    # Writing Similarity in dictionary
    jd_resume_similarity[str(counter)] = jd_sims[resume_doc_tf_idf]
    # Increase the counter for next resume
    counter = counter + 1

### Print the similarity %

In [None]:
# jd_resume_similarity is keyed by the strings '1', '2', ...; for each resume
# number print every stored score multiplied by 100 and rounded to two
# decimals, followed by a blank separator.
for resume_number in range(1, len(jd_resume_similarity) + 1):
    print("Similarity percent of Resume", resume_number, "with JDs provided are")
    for score in jd_resume_similarity[str(resume_number)]:
        print(round(score*100, 2))
    print("\n")