# Capstone Resumes
## Saxa 4

***

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from docx import Document
import os
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pdfplumber
import nltk
nltk.download('punkt')
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nicholasreese/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


***

In [2]:
def convert_files_to_json(folder_path, output_json_file):
    corpus = []
    
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        
        if filename.endswith('.docx'):
            doc = Document(file_path)
            text = '\n'.join([para.text for para in doc.paragraphs])
            document_data = {
                'title': filename,
                'text': text, 
                'type': 'word',
                'file_path': file_path
            }
            corpus.append(document_data)
            print(json.dumps(corpus, ensure_ascii=False, indent=4))
            print(f"Added .docx: {filename}") 
        
        elif filename.endswith('pdf'):
            with pdfplumber.open(file_path) as pdf:
                text = '\n'.join([page.extract_text() for page in pdf.pages if page.extract_text()])
                document_data = {
                    'title': filename, 
                    'text': text,
                    'type': 'pdf',
                    'file_path': file_path
                }
                corpus.append(document_data)
                print(f"Added .pdf: {filename}")
                
    with open(output_json_file, 'w', encoding= 'utf-8') as json_file:
        json.dump(corpus, json_file, ensure_ascii = False, indent = 4)
        
convert_files_to_json('/Users/nicholasreese/Desktop/Georgetown/Capstone/NER_Modeling', 'output_corpus.join')

# change your file path to where you saved the resumes

[
    {
        "title": "Genesis Roberto 2024 Resume.docx",
        "text": "Ms. Genesis U. Roberto \n     Rockville, MD   \t (503) 995-9232\t      groberto27@gmail.com \t             Active CPA (MD, VA)\n\nProfile: A data driven, results oriented finance and accounting professional with deep experience in financial statements reporting requirement under US GAAP and SOX internal controls. Well-versed in financial statement analysis, M&A due diligence reviews and government contracting process with Master’s degree in Business Analysis from Georgetown University. \n\nProfessional Experience\t\t\t\t\nSaggar and Rosenberg, CPAs – Rockville, MD\tDec 2023 - Present\nSenior Manager, Audit Services\nManaged financial audits & reviews for corporate clients in the defense contractors, retail and engineering industry.\nSupported clients in corporate growth strategy and valuation modeling initiatives, including buy and sell-side advisory services.\nEvaluated compliance with ASC 606, ASC 842, and 

In [3]:
resumes = pd.read_json('output_json_file')

In [4]:
resumes

Unnamed: 0,title,text,type
0,Genesis Roberto 2024 Resume.docx,"Ms. Genesis U. Roberto \n Rockville, MD ...",word
1,Jonathan J Saville resume.docx,"Jonathan J Saville\n503 Edwards Ave, Apt #7 ...",word
2,Dezmond Richardson GU Q3.2024 Resume.docx.pdf,D R\nEZMOND ICHARDSON\nddr34@georgetown.edu ▪ ...,pdf
3,Nicholas Reese Resume .docx,"Nicholas E. Reese\nVan Ness, Washington D.C.\n...",word


Ashlyn's resume doesnt work because it is not a UTF-8 string. I think its because all of her paragraphs in her resume are in tables or textboxes. I saw something similar when the function did not pick up my table in my resume. 

***

In [5]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [6]:
### Converting All Text to lower case

resumes['text'] = resumes['text'].str.lower()

In [7]:
resumes

Unnamed: 0,title,text,type
0,Genesis Roberto 2024 Resume.docx,"ms. genesis u. roberto \n rockville, md ...",word
1,Jonathan J Saville resume.docx,"jonathan j saville\n503 edwards ave, apt #7 ...",word
2,Dezmond Richardson GU Q3.2024 Resume.docx.pdf,d r\nezmond ichardson\nddr34@georgetown.edu ▪ ...,pdf
3,Nicholas Reese Resume .docx,"nicholas e. reese\nvan ness, washington d.c.\n...",word


In [8]:
resumes_values = [f'res{i + 1}' for i in range(len(resumes))]

In [9]:
resumes.insert(0, 'resume', resumes_values)

In [10]:
resumes

Unnamed: 0,resume,title,text,type
0,res1,Genesis Roberto 2024 Resume.docx,"ms. genesis u. roberto \n rockville, md ...",word
1,res2,Jonathan J Saville resume.docx,"jonathan j saville\n503 edwards ave, apt #7 ...",word
2,res3,Dezmond Richardson GU Q3.2024 Resume.docx.pdf,d r\nezmond ichardson\nddr34@georgetown.edu ▪ ...,pdf
3,res4,Nicholas Reese Resume .docx,"nicholas e. reese\nvan ness, washington d.c.\n...",word


In [11]:
vec = CountVectorizer()

In [12]:
resumes_vec = vec.fit_transform(resumes['text'])

In [13]:
#print(resumes_vec)

In [14]:
resumes_vec_dense = pd.DataFrame(resumes_vec.todense(),
                                columns = vec.get_feature_names_out(),
                                index = resumes['resume'])

In [15]:
print(resumes_vec_dense)

        10  12m  15  180m  19m  20  2010  2011  2014  2015  ...  workload  \
resume                                                      ...             
res1     0    1   2     1    1   1     1     2     0     2  ...         0   
res2     0    0   0     0    0   0     0     2     1     1  ...         0   
res3     0    0   0     0    0   0     0     0     0     0  ...         0   
res4     1    0   1     0    0   1     0     0     1     0  ...         1   

        workloads  writer  writing  wrote  www  year  years  yellow  young  
resume                                                                      
res1            0       1        1      0    0     5      0       1      2  
res2            0       0        0      1    0     1      0       0      0  
res3            0       0        0      0    1     0      0       0      0  
res4            1       0        0      0    0     0      1       0      0  

[4 rows x 1049 columns]


In [16]:
print(f"Number of Resumes: {resumes_vec_dense.shape[0]}")

Number of Resumes: 4


In [17]:
print(f" Number of terms: {resumes_vec_dense.shape[1]}")

 Number of terms: 1049


***

## Bag of Words for Resumes

In [18]:
resumes_vec2 = vec.fit_transform(resumes['text'])

In [19]:
resumes_vec2 = pd.DataFrame(resumes_vec2.todense(),
                           columns = vec.get_feature_names_out())

In [20]:
resumes_cat = pd.concat([resumes[['resume']], resumes_vec2], axis = 1)

In [21]:
resumes_cat

Unnamed: 0,resume,10,12m,15,180m,19m,20,2010,2011,2014,...,workload,workloads,writer,writing,wrote,www,year,years,yellow,young
0,res1,0,1,2,1,1,1,1,2,0,...,0,0,1,1,0,0,5,0,1,2
1,res2,0,0,0,0,0,0,0,2,1,...,0,0,0,0,1,0,1,0,0,0
2,res3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,res4,1,0,1,0,0,1,0,0,1,...,1,1,0,0,0,0,0,1,0,0


***

## TD - IDF

finding the TD-IDF scores of unique words 

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
tvec = TfidfVectorizer(stop_words = 'english')

In [24]:
resume_vec = tvec.fit_transform(resumes['text'])

In [25]:
resumes_tvec = pd.DataFrame(resume_vec.todense(),
                           columns = tvec.get_feature_names_out())

In [26]:
resumes_tvec

Unnamed: 0,10,12m,15,180m,19m,20,2010,2011,2014,2015,...,workload,workloads,writer,writing,wrote,www,year,years,yellow,young
0,0.0,0.017635,0.027807,0.017635,0.017635,0.013903,0.017635,0.027807,0.0,0.027807,...,0.0,0.0,0.017635,0.017635,0.0,0.0,0.069517,0.0,0.017635,0.035269
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054433,0.027216,0.027216,...,0.0,0.0,0.0,0.0,0.03452,0.0,0.027216,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.062267,0.0,0.0,0.0,0.0
3,0.038345,0.0,0.030231,0.0,0.0,0.030231,0.0,0.0,0.030231,0.0,...,0.038345,0.038345,0.0,0.0,0.0,0.0,0.0,0.038345,0.0,0.0


***

## Adding Bag of N-grams

This might help us because we may need word associations that could better filter the pairing of words

In [27]:
vec_2 = CountVectorizer(ngram_range = (1,2))

In [28]:
resume_vec2 = vec.fit_transform(resumes['text'])

In [29]:
resume_vec2 = pd.DataFrame(resume_vec2.todense(),
                          columns = vec.get_feature_names_out())

In [30]:
resume_vec2

Unnamed: 0,10,12m,15,180m,19m,20,2010,2011,2014,2015,...,workload,workloads,writer,writing,wrote,www,year,years,yellow,young
0,0,1,2,1,1,1,1,2,0,2,...,0,0,1,1,0,0,5,0,1,2
1,0,0,0,0,0,0,0,2,1,1,...,0,0,0,0,1,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,1,0,1,0,0,1,0,0,1,0,...,1,1,0,0,0,0,0,1,0,0


In [34]:
t = resume_vec2.T