# Building a Resume Parser: Turning Unstructured Data into a Dataset
### Chris Tarzian
##### 6/23/2022

#### The code below turns unstructured data into a structured dataset. It parses a candidate's resume and returns their Full Name, Cell Phone, Email, Location, College, Education Level, Company, Job Title and Skillset. <br><br>I used various NLP techniques throughout, including n-grams, punctuation removal, tokenization, text cleaning, and stop-word removal. I also used web scraping with Selenium and Beautiful Soup to collect valuable information such as company names and colleges, building an archive that helps identify information in the resumes. <br><br>I was inspired to do this project because my work as a recruiter requires me to sift through resumes all the time and identify high-priority candidates. This was my first foray into turning unstructured data into a dataset, and I welcome any feedback or insight to improve the code. Cheers!

In [128]:
import pandas as pd
import os
from os import walk

import io
import re
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk import ngrams
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

import spacy
import locationtagger
 
# essential entity models downloads
# nltk.downloader.download('maxent_ne_chunker')
# nltk.downloader.download('words')
# nltk.downloader.download('treebank')
# nltk.downloader.download('maxent_treebank_pos_tagger')
# nltk.downloader.download('punkt')
# nltk.download('averaged_perceptron_tagger')

from pdfminer.high_level import extract_text
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys

##### Code below to locate absolute path of a File

In [2]:
# Resolve the resume's file name to an absolute path (relative to the current
# working directory) and print it.
simple_path = 'ChrisTarzianResume2022.pdf'
abs_path = os.path.abspath(simple_path)
print(abs_path)

c:\Users\Chris Tarzian\Documents\Post-Fordham\Visual Studio Code\PDF Resume Parser\ChrisTarzianResume2022.pdf


In [3]:
## the r prefix (raw string) is very important here: it keeps the backslashes
## in the Windows path from being read as escape sequences
## setting the working directory to the folder that holds the resumes
os.chdir(r"c:\Users\Chris Tarzian\Documents\Post-Fordham\Visual Studio Code\PDF Resume Parser\Resumes2")
os.getcwd()

'c:\\Users\\Chris Tarzian\\Documents\\Post-Fordham\\Visual Studio Code\\PDF Resume Parser\\Resumes2'

## <span style='color:Green '> Importing PDF Files/Resumes </span> 

In [4]:
# Collect every regular file in the working directory whose name ends in ".pdf".
resume_files = [
    entry
    for entry in os.listdir('.')
    if entry.endswith('.pdf') and os.path.isfile(entry)
]
resume_files

['Aram_Keshgegian_Resume.pdf',
 'ArbyTorossianResume.pdf',
 'ChrisTarzianResume2022.pdf',
 'DeanDerSimonianResume.pdf',
 'MT Resume-1.pdf']

In [264]:
# Extract the raw text of two sample resumes used to try out the parsers below.
text1 = extract_text('ChrisTarzianResume2022.pdf')
# The name is spelled exactly as in the directory listing
# ('DeanDerSimonianResume.pdf') so the file is also found on case-sensitive
# file systems.
text2 = extract_text('DeanDerSimonianResume.pdf')

## <span style='color:Green '> Get Full Name Function </span> 

In [6]:
def get_full_name(name):
    """Guess the candidate's full name from the beginning of the resume text.

    The punctuation characters ``,.;/@#?!&$`` are replaced by spaces, the text
    is lower-cased and split on any whitespace, and tokens of a single
    character (e.g. a middle initial) are dropped. The first two remaining
    tokens are returned in title case.

    Args:
        name: Raw resume text.

    Returns:
        The first two usable tokens joined by a space and title-cased, or
        ``None`` when the text holds fewer than two such tokens.
    """
    text = re.sub(r"[,.;/@#?!&$]+\ *", " ", name).lower()

    # Split on any whitespace (not only spaces) so a line break right after
    # the name does not glue the next line onto the surname.
    tokens = [token for token in text.split() if len(token) > 1]

    # Return None like get_phone_number/get_email do when nothing is found,
    # instead of raising IndexError.
    if len(tokens) < 2:
        return None

    return ' '.join(tokens[:2]).title()

In [7]:
get_full_name(text1)

'Chris Tarzian'

## <span style='color:Green '> Get Phone Number Function </span> 

In [83]:
def get_phone_number(text):
    """Return the first phone number found in ``text`` as a string of digits.

    The captured groups of the first match (country code, area code,
    exchange, line number, extension) are concatenated. A result longer than
    ten characters is prefixed with ``+``. Returns ``None`` when nothing
    matches.
    """
    pattern = re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?')

    first_match = pattern.search(text)
    if first_match is None:
        return None

    # Groups that did not take part in the match contribute an empty string.
    digits = ''.join(first_match.groups(default=''))
    return '+' + digits if len(digits) > 10 else digits

In [84]:
get_phone_number(text1)

'2019567519'

## <span style='color:Green '> Get Email Function </span> 

In [10]:
def get_email(email):
    """Return the first e-mail address found in the resume text.

    Args:
        email: Raw resume text.

    Returns:
        The first whitespace-delimited chunk of the first regex match, with
        leading/trailing ``;`` removed, or ``None`` when nothing matches.
    """
    # Raw string: "\s" and "\." are not valid escapes in a normal string
    # literal and trigger warnings on recent Python versions.
    matches = re.findall(r"([^@|\s]+@[^@]+\.[^@|\s]+)", email)
    if not matches:
        return None

    # The domain part of the pattern may run across whitespace, so keep only
    # the first chunk. A match always starts with a non-whitespace character,
    # so split() is never empty here.
    return matches[0].split()[0].strip(';')

In [11]:
get_email(text1)

'christarzian@gmail.com'

## <span style='color:Green '> Get Location Function </span> 

In [275]:
def get_location(location):
    """Return a location for the resume using ``locationtagger``.

    Args:
        location: Raw resume text.

    Returns:
        All detected regions joined by spaces when any were found; otherwise
        the first key of ``region_cities``; otherwise ``None``.
    """
    # extracting entities
    place_entity = locationtagger.find_locations(text=location)

    # Prefer the regions found directly in the text.
    regions = place_entity.regions
    if regions:
        return ' '.join(regions)

    # Fall back to the first key of region_cities (used as a mapping here).
    region_cities = place_entity.region_cities
    if region_cities:
        return next(iter(region_cities))

    # Nothing recognised: return None instead of raising IndexError.
    return None

In [276]:
get_location(text1)

'New York'

## <span style='color:Green '> Get Education Level Function </span> 

In [12]:
def get_edu_level(edu_level):
    """Return the degree names and abbreviations mentioned in the resume text.

    The text is upper-cased and every run of 1 to 4 consecutive words is
    compared with the (upper-cased) degree list. Two passes are made: one
    with periods treated as separators (so "Master of Science." still
    matches) and one with periods kept (so dotted abbreviations such as
    "M.B.A" or "Ph.D." can match their list entries).

    Args:
        edu_level: Raw resume text.

    Returns:
        The matched degrees in upper case, joined by ", " in order of first
        discovery without duplicates; an empty string when nothing matches.
    """
    ## list of types of degree options
    ## NOTE(review): 'BE' also matches the ordinary word "be" - confirm it is wanted
    edu_level_list = ['BA', 'BE', 'B.E.', 'B.E', 'BS', 'B.S',
                      'MS', 'M.S.', 'MBA', 'M.B.A.', 'M.B.A',  # comma was missing after 'M.B.A'
                      'phd', 'PhD', 'Ph.D.', 'Ph.D', 'MSEI',
                      'BACHELOR OF SCIENCES', 'BACHELOR OF SCIENCE', 'BACHELOR OF ARTS', 'BACHELOR OF ENGINEERING',
                      'BACHELOR OF MUSIC', 'BACHELOR OF EDUCATION', 'BACHELORS OF BUSINESS ADMINISTRATION',
                      'BACHELOR OF BUSINESS ADMINISTRATION', 'BACHELOR IN BUSINESS ADMINISTRATION', 'BACHELORS IN BUSINESS ADMINISTRATION',
                      'MASTER OF MUSIC', 'MASTER OF EDUCATION', 'MASTER OF SCIENCE',
                      'MASTERS', 'MASTER OF ACCOUNTING', 'MASTER OF BUSINESS ADMINISTRATION']

    ## candidates are upper-cased before comparison, so the list must be too
    known_degrees = {degree.upper() for degree in edu_level_list}

    ## n-gram sizes to try
    ngram_list = [1, 2, 3, 4]

    ## first pattern strips periods, second keeps them for dotted abbreviations
    punctuation_patterns = (r"[,.;/@#?!&$]+\ *", r"[,;/@#?!&$]+\ *")

    ## a dict is used as an insertion-ordered set so the output order is stable
    edu_level_grams = {}
    for pattern in punctuation_patterns:
        tokens = re.sub(pattern, " ", edu_level).upper().split()
        for size in ngram_list:
            for start in range(len(tokens) - size + 1):
                phrase = ' '.join(tokens[start:start + size])
                if phrase in known_degrees:
                    edu_level_grams[phrase] = None

    return ', '.join(edu_level_grams)

In [13]:
get_edu_level(text1)

'MASTER OF SCIENCE, BACHELOR OF BUSINESS ADMINISTRATION'

## <span style='color:Green '> Get Skills Function </span> 

#### <span style='color:Green '> Web Scraping for Skills </span> 

Note: web scrape code below is current but companies can change their HTML

In [14]:
# page that lists the resume keywords used as the skills vocabulary
url = 'https://www.jobscan.co/blog/top-resume-keywords-boost-resume/'

# download the page and parse it
req = requests.get(url)
data = BeautifulSoup(req.text, 'html')

## list that receives the text of every scraped <li>
skills_list = []

# The keywords sit in several <ol> elements: the first <ol> on the page,
# followed by the ones carrying these `start` attributes.
ol_filters = [{}] + [
    {'start': start}
    for start in ('26', '51', '76', '101', '126', '151', '326')
]

for ol_filter in ol_filters:
    ordered_list = data.find('ol', **ol_filter)
    skills_list.extend(item.text for item in ordered_list.find_all("li"))

In [15]:
#checking to see length is correct, scraped 500 skills
print(len(skills_list))
# drop the first scraped entry before saving
# NOTE(review): presumably a non-skill list item - confirm against the page
del skills_list[0]
# store the skills in a one-column CSV so the scrape need not be repeated
skills_df = pd.DataFrame(skills_list, columns=['Skills'])
skills_df.to_csv('skills_list.csv', index=False)

In [16]:
# Reload the saved skills and lower-case them; get_skills compares against
# lower-cased tokens.
skills_df = pd.read_csv('skills_list.csv')
skills_list = [skill.lower() for skill in skills_df['Skills'].tolist()]

#### <span style='color:Green '> Skills Function </span> 

In [17]:
def get_skills(skill):
    """Return the known skills that appear in the resume text.

    The text is tokenized with NLTK, non-alphabetic tokens are dropped, and
    every single token, bigram and trigram is looked up (lower-cased) in the
    module-level ``skills_list``.

    Stop words are deliberately left in: the original code computed a
    stop-word-filtered token list but immediately overwrote it, so stop words
    were never removed. That effective behaviour is kept, which also keeps
    n-grams made of genuinely adjacent words.

    Args:
        skill: Raw resume text.

    Returns:
        The matched skills in title case, joined by ", " in order of first
        discovery (single words, then bigrams, then trigrams) without
        duplicates; an empty string when nothing matches.
    """
    # build the lookup once: set membership instead of scanning the list per phrase
    known_skills = set(skills_list)

    # keep alphabetic tokens only (drops punctuation and numbers)
    filtered_tokens = [w for w in nltk.tokenize.word_tokenize(skill) if w.isalpha()]

    # a dict is used as an insertion-ordered set so the output order is stable
    found_skills = {}
    for size in (1, 2, 3):
        for start in range(len(filtered_tokens) - size + 1):
            phrase = ' '.join(filtered_tokens[start:start + size])
            if phrase.lower() in known_skills:
                found_skills[phrase.title()] = None

    return ', '.join(found_skills)

In [18]:
get_skills(text1)

'Machine Learning, Mining, Mis, Python, Sports, Sourcing, Analytics, Technical Skills, Modeling, Linux, Recruiting, Technical, Sql, Data Analysis, Social Media, Tableau, Researching, Content, Analysis, Programming, Marketing, Finance'

## <span style='color:Green '> Get College Function </span> 

#### <span style='color:Green '> Web Scraping College/Universities </span> 

Note: web scrape code below is current but companies can change their HTML

In [19]:
# alphabetical listing of US universities
url2 = 'https://www.4icu.org/us/a-z/'

# download the page and parse it
req2 = requests.get(url2)
soup = BeautifulSoup(req2.text, 'html')

# the text of every link on the page; non-university links are trimmed later
col_data = soup.find_all("a")
college_list = [anchor.text for anchor in col_data]

In [20]:
# print(college_list.index('A.T. Still University'))
# print(college_list.index('Youngstown State University'))
# Keep only entries 52..1816 of the scraped link texts.
# NOTE(review): the bounds presumably come from the two index lookups
# commented out above - re-check them if the page layout changes.
college_list = college_list[52:1817]
# store the names in a one-column CSV so the scrape need not be repeated
college_df = pd.DataFrame(college_list, columns=['Universities'])
college_df.to_csv('college_list.csv', index=False)

In [21]:
# reload the saved university names from disk
college_df = pd.read_csv('college_list.csv')
college_list = college_df['Universities'].tolist()

In [22]:
# lower-case every name; get_college lower-cases the resume text before comparing
college_list = [each_string.lower() for each_string in college_list]
# overwrite entry 329 with the plain name
# NOTE(review): the hard-coded index depends on the scraped order - verify
# that position 329 is still Columbia before relying on it
college_list[329] = 'columbia university'

#### <span style='color:Green '> College Function </span> 

In [23]:
def get_college(college):
    """Return the known colleges/universities mentioned in the resume text.

    The punctuation characters ``,.;/@#?!&$`` are replaced by spaces, the
    text is lower-cased, and every run of 2 to 7 consecutive words is looked
    up in the module-level ``college_list``.

    Args:
        college: Raw resume text.

    Returns:
        The matched names in title case, joined by ", " in order of first
        discovery without duplicates (matching the set-based version in
        ``all_funcs``); an empty string when nothing matches.
    """
    # build the lookup once: set membership instead of scanning the list per phrase
    known_colleges = set(college_list)

    text = re.sub(r"[,.;/@#?!&$]+\ *", " ", college)
    tokens = text.lower().split()

    # a dict is used as an insertion-ordered set so the output order is stable
    college_grams = {}
    for size in (2, 3, 4, 5, 6, 7):
        for start in range(len(tokens) - size + 1):
            phrase = ' '.join(tokens[start:start + size])
            if phrase in known_colleges:
                college_grams[phrase.title()] = None

    return ', '.join(college_grams)

In [24]:
get_college(text1)

'Fordham University, Villanova University'

## <span style='color:Green '> Get Job Title Function </span> 

#### <span style='color:Green '> Web Scraping CareerBuilder for Job Titles </span> 

Note: web scrape code below is current but companies can change their HTML

In [25]:
from selenium.webdriver.common.by import By

# NOTE(review): `executable_path` is deprecated in Selenium 4; kept as-is so
# the cell still runs on the installed version - confirm before upgrading.
driver_path = r'c:/Users/Chris Tarzian/Desktop/chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)

# CareerBuilder page that links to job titles by first letter
url3 = 'https://www.careerbuilder.com/browse'

alphabet = list(string.ascii_lowercase)
job_title_list = []

# try/finally so the browser is always closed, even if a link is missing
try:
    driver.get(url3)

    for i in alphabet:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide
        driver.find_element(By.XPATH, f"//a[@href='/browse/titles/{i}']").click()

        page_source = driver.page_source
        data3 = BeautifulSoup(page_source, 'html')

        # the text of each matching link is collected as a job title
        jobtitle_content = data3.find_all('a', class_='col-mobile-full')
        for k in jobtitle_content:
            job_title_list.append(k.text)

        # pause between letters
        time.sleep(3)
finally:
    driver.quit()

In [26]:
# store the scraped job titles in a one-column CSV so the scrape need not be repeated
job_title_df = pd.DataFrame(job_title_list, columns=['Job Title'])
job_title_df.to_csv('job_title_list.csv', index=False)

In [27]:
# Reload the saved job titles and lower-case them; get_job_title lower-cases
# the resume text before comparing.
job_title_df = pd.read_csv('job_title_list.csv')
job_title_list = [title.lower() for title in job_title_df['Job Title'].tolist()]

#### <span style='color:Green '> Job Title Function </span> 

In [28]:
def get_job_title(title):
    """Return the known job titles mentioned in the resume text.

    The punctuation characters ``,.;/@#?!&$`` are replaced by spaces, the
    text is lower-cased, and every run of 1 to 3 consecutive words is looked
    up in the module-level ``job_title_list``.

    Args:
        title: Raw resume text.

    Returns:
        The matched titles in title case, joined by ", " in order of first
        discovery without duplicates; an empty string when nothing matches.
    """
    # build the lookup once: set membership instead of scanning the list per phrase
    known_titles = set(job_title_list)

    text = re.sub(r"[,.;/@#?!&$]+\ *", " ", title)
    tokens = text.lower().split()

    # a dict is used as an insertion-ordered set so the output order is stable
    job_title_grams = {}
    for size in (1, 2, 3):
        for start in range(len(tokens) - size + 1):
            phrase = ' '.join(tokens[start:start + size])
            if phrase in known_titles:
                job_title_grams[phrase.title()] = None

    return ', '.join(job_title_grams)

In [29]:
get_job_title(text1)

'Administration, Data Analyst, Financial Analyst, Recruiter, Finance, Technical, Business, Education, Financial, School, Senior Recruiter, Director, Ceo, Marketing, Analyst, Analytics, Science'

## <span style='color:Green '> Get Company Function </span> 

#### <span style='color:Green '> Web Scraping LinkedIn for Public Companies </span> 

Note: web scrape code below is current but companies can change their HTML

In [30]:
from selenium.webdriver.common.by import By

# NOTE(review): `executable_path` is deprecated in Selenium 4; kept as-is so
# the cell still runs on the installed version - confirm before upgrading.
driver_path = r'c:/Users/Chris Tarzian/Desktop/chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path)

# providing url
url4 = 'https://www.linkedin.com/directory/companies'
driver.get(url4)

# Go to linkedin and login
driver.get('https://www.linkedin.com/login')

# Credentials are read from the environment so real values never need to be
# written into the notebook; the masked placeholders remain the fallback.
email = os.environ.get('LINKEDIN_EMAIL', "######@gmail.com")
password = os.environ.get('LINKEDIN_PASSWORD', "########")

# pause before filling in the login form
time.sleep(3)

# find_element(By.ID, ...) replaces find_element_by_id, which newer Selenium
# releases no longer provide
driver.find_element(By.ID, 'username').send_keys(email)
password_field = driver.find_element(By.ID, 'password')
password_field.send_keys(password)
password_field.send_keys(Keys.RETURN)

In [31]:
from selenium.webdriver.common.by import By

# LinkedIn company directory, browsed one letter at a time
url4 = 'https://www.linkedin.com/directory/companies'

company_list = []
alphabet = list(string.ascii_lowercase)

# try/finally so the browser is always closed, even if a link is missing
try:
    driver.get(url4)

    for i in alphabet:
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # newer Selenium releases no longer provide
        driver.find_element(By.XPATH, f"//a[@href='https://www.linkedin.com/directory/companies/{i}?trk=companies_directory_letter_nav']").click()

        page_source = driver.page_source
        data4 = BeautifulSoup(page_source, 'html')

        # the text of each matching link is collected as a company name
        company_content = data4.find_all('a', class_='listings__entry-link')
        for k in company_content:
            company_list.append(k.text)
finally:
    driver.quit()

In [32]:
# store the scraped company names in a one-column CSV so the scrape need not be repeated
company_df = pd.DataFrame(company_list, columns=['Company'])
company_df.to_csv('company_list.csv', index=False)

In [33]:
# reload the saved company names from disk (original capitalisation is kept;
# get_company compares case-sensitively)
company_df = pd.read_csv('company_list.csv')
company_list = company_df['Company'].tolist()

In [34]:
len(company_list)

51993

In [35]:
# Company names that equal a title-cased skill are dropped so skills are not
# reported as employers. A set keeps the membership test cheap for the large
# company list.
skills_titles = {each_string.title() for each_string in skills_list}

# Generic entries that are excluded from the company list.
non_company_entries = {
    'Y',
    'X',
    'University',
    'CEO',
    'G2',
    'Hiring',
    'Foundation',
    'Digital Business',
    'One',
    'Line',
    'Marketing Digital',
    'Analysts',
}

# Filtering (instead of list.remove) removes every occurrence and does not
# raise ValueError when an entry is absent, so the cell can be re-run safely.
company_list = [
    w for w in company_list
    if w not in skills_titles and w not in non_company_entries
]

# the skills vocabulary stays lower-cased for get_skills
skills_list = [each_string.lower() for each_string in skills_list]

#### <span style='color:Green '> Company Function </span> 

In [36]:
def get_company(company):
    """Return the known company names mentioned in the resume text.

    The punctuation characters ``,.;/@#?!&$`` are replaced by spaces and
    every run of 1 to 5 consecutive words is looked up in the module-level
    ``company_list``. The text is not lower-cased, so matching is
    case-sensitive.

    Args:
        company: Raw resume text.

    Returns:
        The matched names exactly as they appear in the text, joined by ", "
        in order of first discovery without duplicates; an empty string when
        nothing matches.
    """
    # build the lookup once: the company list is large, and scanning it for
    # every phrase is needlessly slow
    known_companies = set(company_list)

    tokens = re.sub(r"[,.;/@#?!&$]+\ *", " ", company).split()

    # a dict is used as an insertion-ordered set so the output order is stable
    company_grams = {}
    for size in (1, 2, 3, 4, 5):
        for start in range(len(tokens) - size + 1):
            phrase = ' '.join(tokens[start:start + size])
            if phrase in known_companies:
                company_grams[phrase] = None

    return ', '.join(company_grams)

In [37]:
get_company(text1)

'Deloitte, Recruiter, Sapient, Chase, EY, YOH, Figma, HBO, Razorfish, Spotify, Verizon, LinkedIn'

## <span style='color:Green '> Combining All Functions </span> 

In [271]:
class all_funcs():
    """Namespace bundling every resume-field extractor.

    All methods are static: they take the raw resume text and return the
    extracted value, so they can be called on the class
    (``all_funcs.get_email(text)``) or on an instance. The list-matching
    extractors read the module-level vocabularies ``college_list``,
    ``company_list``, ``job_title_list`` and ``skills_list``.
    """

    @staticmethod
    def _ngram_phrases(tokens, sizes):
        """Yield each run of ``size`` consecutive tokens joined by spaces, for every size in ``sizes``."""
        for size in sizes:
            for start in range(len(tokens) - size + 1):
                yield ' '.join(tokens[start:start + size])

    ## get full name function
    @staticmethod
    def get_full_name(name):
        """Return the first two multi-character words of the text in title case.

        Punctuation ``,.;/@#?!&$`` is replaced by spaces and single-character
        tokens (e.g. a middle initial) are dropped. Returns ``None`` when
        fewer than two usable tokens exist.
        """
        text = re.sub(r"[,.;/@#?!&$]+\ *", " ", name).lower()
        # split on any whitespace so a line break after the name does not
        # glue the next line onto the surname
        tokens = [token for token in text.split() if len(token) > 1]
        if len(tokens) < 2:
            return None
        return ' '.join(tokens[:2]).title()

    ## get phone number function
    @staticmethod
    def get_phone_number(phone):
        """Return the first phone number in the text as digits, or ``None``.

        The captured groups of the first match are concatenated; a result
        longer than ten characters is prefixed with ``+``.
        """
        phone_number = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), phone)

        if not phone_number:
            return None

        number = ''.join(phone_number[0])
        return '+' + number if len(number) > 10 else number

    ## Get email function
    @staticmethod
    def get_email(email):
        """Return the first e-mail address in the text, or ``None``."""
        # raw string: "\s" and "\." are not valid escapes in a normal literal
        matches = re.findall(r"([^@|\s]+@[^@]+\.[^@|\s]+)", email)
        if not matches:
            return None
        # the domain part of the pattern may run across whitespace, so keep
        # only the first chunk and trim any ';'
        return matches[0].split()[0].strip(';')

    ## Get Location Function
    @staticmethod
    def get_location(location):
        """Return a location found by ``locationtagger``, or ``None``.

        All detected regions joined by spaces are preferred; otherwise the
        first key of ``region_cities`` is used.
        """
        place_entity = locationtagger.find_locations(text=location)

        regions = place_entity.regions
        if regions:
            return ' '.join(regions)

        region_cities = place_entity.region_cities
        if region_cities:
            return next(iter(region_cities))

        # nothing recognised: return None instead of raising IndexError
        return None

    ## Get College function
    @staticmethod
    def get_college(college):
        """Return the entries of ``college_list`` found in the text.

        Runs of 2 to 7 lower-cased words are compared with the list; matches
        are title-cased and joined by ", " in order of first discovery.
        """
        known_colleges = set(college_list)
        tokens = re.sub(r"[,.;/@#?!&$]+\ *", " ", college).lower().split()

        # dict used as an insertion-ordered set for a stable output order
        college_grams = {}
        for phrase in all_funcs._ngram_phrases(tokens, (2, 3, 4, 5, 6, 7)):
            if phrase in known_colleges:
                college_grams[phrase.title()] = None

        return ', '.join(college_grams)

    ## Get Education Level Function
    @staticmethod
    def get_edu_level(edu_level):
        """Return the degree names and abbreviations found in the text.

        Runs of 1 to 4 upper-cased words are compared with the upper-cased
        degree list in two passes: periods kept (dotted abbreviations such as
        "B.S.") and periods treated as separators (so "Master of Science."
        at the end of a sentence still matches). Matches are joined by ", "
        in order of first discovery.
        """
        ## list of types of degree options
        edu_level_list = ['BA', 'B.A.', 'B.E.', 'B.E', 'BS', 'B.S', 'B.S.',
                          'MS', 'M.S.', 'MBA', 'M.B.A.', 'M.B.A',  # comma was missing after 'M.B.A'
                          'phd', 'PhD', 'Ph.D.', 'Ph.D', 'MSEI',
                          'BACHELOR OF SCIENCES', 'BACHELOR OF SCIENCE', 'BACHELOR OF ARTS', 'BACHELOR OF ENGINEERING',
                          'BACHELOR OF MUSIC', 'BACHELOR OF EDUCATION', 'BACHELORS OF BUSINESS ADMINISTRATION',
                          'BACHELOR OF BUSINESS ADMINISTRATION', 'BACHELOR IN BUSINESS ADMINISTRATION', 'BACHELORS IN BUSINESS ADMINISTRATION',
                          'MASTER OF MUSIC', 'MASTER OF EDUCATION', 'MASTER OF SCIENCE',
                          'MASTERS', 'MASTER OF ACCOUNTING', 'MASTER OF BUSINESS ADMINISTRATION']

        ## candidates are upper-cased before comparison, so the list must be too
        known_degrees = {degree.upper() for degree in edu_level_list}

        ## first pattern keeps periods, second strips them
        punctuation_patterns = (r"[,;/@#?!&$]+\ *", r"[,.;/@#?!&$]+\ *")

        # dict used as an insertion-ordered set for a stable output order
        edu_level_grams = {}
        for pattern in punctuation_patterns:
            tokens = re.sub(pattern, " ", edu_level).upper().split()
            for phrase in all_funcs._ngram_phrases(tokens, (1, 2, 3, 4)):
                if phrase in known_degrees:
                    edu_level_grams[phrase] = None

        return ', '.join(edu_level_grams)

    ## Get Company Function
    @staticmethod
    def get_company(company):
        """Return the entries of ``company_list`` found in the text.

        Runs of 1 to 5 words are compared case-sensitively with the list;
        matches are joined by ", " in order of first discovery.
        """
        # the company list is large: build a set once instead of scanning
        # the list for every phrase
        known_companies = set(company_list)
        tokens = re.sub(r"[,.;/@#?!&$]+\ *", " ", company).split()

        company_grams = {}
        for phrase in all_funcs._ngram_phrases(tokens, (1, 2, 3, 4, 5)):
            if phrase in known_companies:
                company_grams[phrase] = None

        return ', '.join(company_grams)

    ## Get job title function
    @staticmethod
    def get_job_title(title):
        """Return the entries of ``job_title_list`` found in the text.

        Runs of 2 to 5 lower-cased words are compared with the list; matches
        are title-cased and joined by ", " in order of first discovery.
        """
        known_titles = set(job_title_list)
        tokens = re.sub(r"[,.;/@#?!&$]+\ *", " ", title).lower().split()

        job_title_grams = {}
        for phrase in all_funcs._ngram_phrases(tokens, (2, 3, 4, 5)):
            if phrase in known_titles:
                job_title_grams[phrase.title()] = None

        return ', '.join(job_title_grams)

    ## Get Skills Function
    @staticmethod
    def get_skills(skill):
        """Return the entries of ``skills_list`` found in the text.

        The text is tokenized with NLTK and non-alphabetic tokens are
        dropped; single tokens, bigrams and trigrams are lower-cased for the
        lookup. Stop words are left in: the original code computed a
        stop-word-filtered list but overwrote it, so that filter never took
        effect and the effective behaviour is kept. Matches are title-cased
        and joined by ", " in order of first discovery.
        """
        known_skills = set(skills_list)

        # keep alphabetic tokens only (drops punctuation and numbers)
        filtered_tokens = [w for w in nltk.tokenize.word_tokenize(skill) if w.isalpha()]

        found_skills = {}
        for phrase in all_funcs._ngram_phrases(filtered_tokens, (1, 2, 3)):
            if phrase.lower() in known_skills:
                found_skills[phrase.title()] = None

        return ', '.join(found_skills)

In [272]:
# Run every extractor against the first sample resume and print each result
# on its own line.
for extractor in (
    all_funcs.get_full_name,
    all_funcs.get_phone_number,
    all_funcs.get_email,
    all_funcs.get_location,
    all_funcs.get_college,
    all_funcs.get_edu_level,
    all_funcs.get_company,
    all_funcs.get_job_title,
    all_funcs.get_skills,
):
    print(extractor(text1))

Chris Tarzian
2019567519
christarzian@gmail.com
New York
Villanova University, Fordham University
MASTER OF SCIENCE, BACHELOR OF BUSINESS ADMINISTRATION
Deloitte, Recruiter, Sapient, Chase, EY, YOH, Figma, HBO, Razorfish, Spotify, Verizon, LinkedIn
Data Analyst, Senior Recruiter, Financial Analyst
Machine Learning, Mining, Mis, Python, Sports, Sourcing, Analytics, Technical Skills, Modeling, Linux, Recruiting, Technical, Sql, Data Analysis, Social Media, Tableau, Researching, Content, Analysis, Programming, Marketing, Finance


## <span style='color:Green '> Data In Structured Form </span> 

In [269]:
# One result list per extracted field (`education` is created but never filled).
full_name, phone_number, email, location = [], [], [], []
education, edu_level, skills = [], [], []
college, job_title, company = [], [], []

# Each result list paired with the extractor that feeds it, in call order.
field_extractors = (
    (full_name, all_funcs.get_full_name),
    (phone_number, all_funcs.get_phone_number),
    (email, all_funcs.get_email),
    (location, all_funcs.get_location),
    (edu_level, all_funcs.get_edu_level),
    (college, all_funcs.get_college),
    (company, all_funcs.get_company),
    (job_title, all_funcs.get_job_title),
    (skills, all_funcs.get_skills),
)

## extract the text of every resume in the directory and run each extractor on it
for info in resume_files:
    text = extract_text(info)
    for values, extract in field_extractors:
        values.append(extract(text))

# column name -> list of values, one entry per resume
data = {
    'Full Name': full_name,
    'Phone_Number': phone_number,
    'Email': email,
    'Location': location,
    'College': college,
    'Education_Level': edu_level,
    'Company': company,
    'Job_Title': job_title,
    'Skills': skills,
}

## converting dictionary into a df
resume_df = pd.DataFrame(data)

In [270]:
resume_df.head()

Unnamed: 0,Full Name,Phone_Number,Email,Location,College,Education_Level,Company,Job_Title,Skills
0,Aram Keshgegian,6107166226,aram.keshgegian@vertexinc.com,Yerevan,Drexel University,"MASTER OF BUSINESS ADMINISTRATION, BACHELOR OF...","Mother, King, Vertex Inc, Performance Food Gro...","Inside Sales Representative, Sales Development...","Partnership, Sales, Crm, Oracle, Process Impro..."
1,Arby Torossian,6176788333,Arby.Torossian@gmail.com,Massachusetts New York,University Of Massachusetts Boston,B.S.,"Wells Fargo, Wayfair, Canva, Toast, Looker, We...","Operations Analyst, Data Analyst, Internationa...","Operations, Sales, Transactions, Warehouse, Qu..."
2,Chris Tarzian,2019567519,christarzian@gmail.com,New York,"Villanova University, Fordham University","MASTER OF SCIENCE, BACHELOR OF BUSINESS ADMINI...","Deloitte, Recruiter, Sapient, Chase, EY, YOH, ...","Data Analyst, Senior Recruiter, Financial Analyst","Machine Learning, Mining, Mis, Python, Sports,..."
3,Dean Dersimonian,6109373511,ddersimon@gmail.com,Pennsylvania,Ursinus College,BACHELOR OF ARTS,"SEI, Outside, Vertex Inc, King","Inside Sales Representative, Data Entry, Solut...","Sales, Data Entry, Oracle, Value Proposition, ..."
4,Matthew Tarzian,172019567518,mtarzian1@gmail.com,New York,Villanova University,B.A.,"HomeAdvisor, Angi, Van Heusen, SAP, Google, LL...",Account Executive,"Staffing, Sales, Crm, Oracle, Retention, Video..."
