In [1]:
import pandas as pd
import re 
import spacy 
import os 
import numpy as np
import altair as alt
import textdistance
from sklearn.preprocessing import MultiLabelBinarizer
# Select Altair's 'json' data transformer for the charts rendered below.
alt.data_transformers.enable('json')
# Let pandas show up to 100 characters per cell before truncating the text.
pd.options.display.max_colwidth = 100

In [2]:
# Load the "en_core_web_sm" spaCy model.
# NOTE(review): `nlp` is not referenced by any other cell visible in this notebook — confirm it is still needed.
nlp = spacy.load("en_core_web_sm")

In [6]:
# Load the manual extraction spreadsheet (path is relative to the notebook's working directory).
extract = pd.read_excel("../../../Glentel Inc/HR Analytics - Documents/Capstone Data/ubc_mds_team_share/01_resume_scan_data/manual_extraction_template.xlsx")

# Normalize headers: trim, lower-case, spaces -> underscores, drop parentheses.
# regex=False makes every replacement a literal one, so '(' and ')' are never
# interpreted as regex syntax regardless of the installed pandas version's default.
extract.columns = (
    extract.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [7]:
# Show the first row as a sanity check of the normalized column names.
# NOTE(review): the rendered output includes the employee_name column — clear or redact it before sharing.
extract.head(1)

Unnamed: 0,employee_code,employee_name,found,work1_title,work1_company,work1_time,work2_title,work2_company,work2_time,work3_title,...,education2_concentration,education2_country,education3_school,education3_degree,education3_concentration,education3_country,education4_school,education4_degree,education4_concentration,education4_country
0,BDP,"Abdulali, Kaizar",clara,Sales Manager (Samsung Experience Store),Glentel Inc.,Nov 2015 - Present,Sales Manager (Wireless Wave & T-Booth),Glentel Inc.,Aug 2010 - Nov 2015,Sales Associate (Wireless Wave & T-Booth),...,Economic,Canada,,,,,,,,


In [8]:
# Ordered (pattern, replacement) rewrites applied with re.sub before grouping.
# NOTE(review): "sale[^s]" also consumes the character after "sale", and
# "specia[^s]" strips only "specia" plus one character (so "specialist" becomes "ist").
# Both are kept as-is because downstream labels currently depend on them.
_TITLE_REWRITES = [
    ("part time", ''),
    (r"seasona[^\s]+", ''),
    (r"superv[^\s]+", 'manager'),
    (r"team.+lea[^\s]+", 'manager'),
    ("wait[er][er]", 'server'),
    (r"chie[^\s]+", 'culinary'),
    ("chef", 'culinary'),
    (r"bran[^\s]+", 'sales'),
    (r"instr[^\s]+", 'teacher'),
    (r"gues[^\s]+", 'customer'),
    (r"clien[^\s]+", 'customer'),
    (r"kios[^\s]+", 'retail'),
    ("sale[^s]", 'sales'),
    ("specia[^s]", ''),
]

# Grouping rules evaluated BEFORE the generic "contains 'manager'" check.
# NOTE(review): 'sales.+[pca]' is very broad — e.g. "sales manager" is turned into
# 'sales associate' here, before any manager rule can see it. Confirm this is intended.
_SALES_AND_MANAGER_GROUPS = [
    # sales associate
    (r'mobil[^\s]+', 'sales associate'),
    (r'sales a([^\sn]+)', 'sales associate'),
    (r'sal.+re[^\s]+', 'sales associate'),
    (r'sales.+[pca]', 'sales associate'),
    (r'sales.+ex[^\s]+', 'sales associate'),
    # assistant manager
    (r'assis.+ manager', 'assistant manager'),
    # manager
    (r's[ta][ol][er].+manager', 'manager'),
    (r'service.+manager', 'manager'),
    (r'sale.+man', 'manager'),
]

# Grouping rules evaluated AFTER the generic manager check.
_REMAINING_GROUPS = [
    # customer service representative
    (r'customer.+[sc][^\s]+', 'customer service representative'),
    ('clerk', 'customer service representative'),
    ('team [m]', 'customer service representative'),
    # server
    ('serve', 'server'),
    ('barte', 'server'),
    ('host', 'server'),
    # cashier
    ('cashi', 'cashier'),
    # education ('educat' already covers 'education')
    (r'teach[^\s]+', 'education'),
    (r'lectur[^\s]+', 'education'),
    ('tutor', 'education'),
    ('educat', 'education'),
    ('student', 'education'),
    # culinary
    ('culinary', 'culinary'),
    ('kitchen', 'culinary'),
    ('cook', 'culinary'),
    # administration
    (r'administr[^\s]+', 'administrative'),
    ('office', 'administrative'),
    ('executi', 'administrative'),
    ('coordinator', 'administrative'),
    ('auditor', 'administrative'),
    # driver
    (r'drive[^\s]+', 'driver'),
    ('deliv', 'driver'),
    # blue collar
    (r'labo[^\s]+', 'blue collar'),
    ('electrici', 'blue collar'),
    ('plumber', 'blue collar'),
    ('carpent', 'blue collar'),
    ('construc', 'blue collar'),
    ('renovat', 'blue collar'),
    ('manpower', 'blue collar'),
    # technicians
    ('technici', 'technician'),
    # fitness/sports
    ('coach', 'fitness/sports'),
    ('fitnes', 'fitness/sports'),
    ('traine', 'fitness/sports'),
    ('referee', 'fitness/sports'),
    # financial services
    ('financia', 'financial services'),
    ('analy', 'financial services'),
    # was 'bookee', which cannot match "bookkeeper"/"bookkeeping" (double k)
    (r'bookk?ee', 'financial services'),
    ('mortgag', 'financial services'),
    ('broker', 'financial services'),
    # telemarketers
    ('call cent', 'telemarketers'),
]


def _apply_title_groups(text, groups):
    """Apply (pattern, label) rules in order; every matching rule overwrites text."""
    for pattern, label in groups:
        if re.search(pattern, text):
            text = label
    return text


def preprocess(text):
    """Normalize a raw job title and collapse it into a coarse job-title group.

    Parameters
    ----------
    text : any
        Raw title; converted with ``str()``, so a missing value (NaN) arrives
        as the string ``'nan'``.

    Returns
    -------
    str
        A group name such as ``'sales associate'`` or ``'manager'`` when a
        grouping rule matches, otherwise the cleaned title itself.
        ``'not_specified'`` is returned for missing or empty titles.

    Notes
    -----
    Rules are order-dependent: every matching rule overwrites the text and
    later rules are tested against the overwritten value, so the last rule
    that matches wins.
    """
    # Lower-case, then reduce every run of non-word characters to one space.
    # This already removes punctuation such as '*', '~' and '-'.
    text = str(text).lower().strip()
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'[\d-]', '', text)
    # Removing digits can leave doubled spaces behind; collapse them again.
    text = re.sub(r'\s+', ' ', text)

    # str(NaN) == 'nan': treat it as a missing title.
    if text == 'nan':
        text = "not_specified"

    # Vocabulary rewrites (synonyms, noise words).
    for pattern, replacement in _TITLE_REWRITES:
        text = re.sub(pattern, replacement, text)

    # Group job titles.
    text = _apply_title_groups(text, _SALES_AND_MANAGER_GROUPS)
    # Any remaining title that mentions a manager (but not an assistant) is a manager.
    if 'manager' in text and 'assistant' not in text:
        text = 'manager'
    text = _apply_title_groups(text, _REMAINING_GROUPS)

    text = re.sub(r'\s+', ' ', text.strip())
    return text if text else "not_specified"

In [9]:
# Clean one work-title column; col_num picks which of the numbered columns to use.
col_num = 1
extract['work{}_title_clean'.format(col_num)] = extract['work{}_title'.format(col_num)].apply(preprocess)

In [10]:
# extract['work'+str(col_num)+'_title_clean'].to_clipboard()

In [11]:
# Bar chart: row count per cleaned title for the column selected by col_num.
(
    alt.Chart(extract)
    .mark_bar()
    .encode(
        x="count()",
        y='work{}_title_clean'.format(col_num),
    )
)

In [12]:
categorical_list = ['sales associate', 'assistant manager', 'customer service representative', 'financial services', 'telemarketers', 'fitness/sports', 'not_specified', 'manager', 'server']

def worktitle_labeling(text, categorical_list=categorical_list, word_tolarance = .3):
    """Map a cleaned job title onto one of the known categories.

    Parameters
    ----------
    text : str
        Cleaned job title.
    categorical_list : list of str
        Candidate categories. The default is bound when the function is
        defined, so rebinding the module-level name later does not change it.
    word_tolarance : float
        Minimum normalized Levenshtein similarity (exclusive) required to
        accept the closest category.

    Returns
    -------
    str
        ``text`` unchanged when it contains any category as a substring;
        otherwise the most similar category when its similarity exceeds
        ``word_tolarance``; otherwise ``'other_jobtitle'``.
    """
    # A title that already contains a category name is returned as-is.
    if any(category in text for category in categorical_list):
        return text

    # Single pass for the closest category. The strict '>' keeps the earliest
    # category on ties, which matches a stable descending sort.
    best_category = None
    best_similarity = float("-inf")
    for category in categorical_list:
        similarity = textdistance.levenshtein.normalized_similarity(text, category)
        if similarity > best_similarity:
            best_category, best_similarity = category, similarity

    # best_category stays None for an empty category list: nothing to compare against.
    if best_category is not None and best_similarity > word_tolarance:
        return best_category
    return "other_jobtitle"

In [13]:
# Two passes over the work1..work7 title columns: first derive every *_title_clean
# column with preprocess, then derive every *_title_label column from the cleaned
# values with worktitle_labeling. Pass order is kept so all clean columns are
# created before any label column.
for suffix_in, suffix_out, transform in (
    ('_title', '_title_clean', preprocess),
    ('_title_clean', '_title_label', worktitle_labeling),
):
    for i in range(1, 8):
        col_num = i
        prefix = 'work' + str(col_num)
        extract[prefix + suffix_out] = extract[prefix + suffix_in].apply(transform)

In [14]:
# Test the labels: row count per label for the column selected by col_num.
col_num = 1
(
    alt.Chart(extract)
    .mark_bar()
    .encode(
        x="count()",
        y='work{}_title_label'.format(col_num),
    )
)

In [15]:
# Collect, per row, the labels of work1..work7 in column order, skipping
# 'not_specified'. A row with no specified label at all gets ['no_work_title'].
label_columns = ['work{}_title_label'.format(slot) for slot in range(1, 8)]

result = []
# itertuples walks rows by position, so this does not rely on the frame
# having a default 0..n-1 index (the previous extract[col][i] lookup did).
for row_labels in extract[label_columns].itertuples(index=False, name=None):
    lst = [label for label in row_labels if str(label) != 'not_specified']
    if not lst:
        lst = ["no_work_title"]
    result.append(lst)

extract["work_title_list"] = result

In [16]:
# One indicator column per distinct label found in work_title_list.
# pop() removes the list column from `extract` in place, so this cell needs the
# previous cell to have (re)created work_title_list before it is run again.
mlb = MultiLabelBinarizer()
work_title_lists = extract.pop('work_title_list')
indicator_values = mlb.fit_transform(work_title_lists)
indicator_frame = pd.DataFrame(indicator_values, columns=mlb.classes_, index=extract.index)
extract = extract.join(indicator_frame)

In [17]:
# Drop the raw inputs and the intermediate *_title_clean columns from the new frame.
raw_work_columns = [
    'work{}_{}'.format(slot, field)
    for slot in range(1, 8)
    for field in ('title', 'company', 'time')
]
education_columns = [
    'education{}_{}'.format(slot, field)
    for slot in range(1, 5)
    for field in ('school', 'degree', 'concentration', 'country')
]
clean_title_columns = ['work{}_title_clean'.format(slot) for slot in range(1, 8)]

extract_work_title = extract.drop(
    ['found'] + raw_work_columns + education_columns + clean_title_columns,
    axis=1,
)

In [18]:
# Category names to be turned into indicator-column names.
# NOTE(review): 'cook' and 'technicians' differ from the strings preprocess()
# produces ('culinary', 'technician') — confirm which spelling is intended.
categorical_list = ['sales associate', 'assistant manager', 'customer service representative', 'cashier', 'education', 'cook', 'administrative', 'driver', 'blue collar', 'technicians', 'financial services', 'telemarketers', 'fitness/sports', 'not_specified', 'manager', 'server']

# Column-safe name per category: whitespace runs and '/' become '_', then the
# '_jobtitle' suffix is appended. final_label[k] corresponds to categorical_list[k].
final_label = [
    re.sub('/', "_", re.sub(r'\s+', "_", category.strip())) + "_jobtitle"
    for category in categorical_list
]

In [19]:
# Map each category name to its column-safe name (the two lists are index-aligned).
results = dict(zip(categorical_list, final_label))

In [20]:
# Display the category -> column-name mapping for a visual check.
results

{'sales associate': 'sales_associate_jobtitle',
 'assistant manager': 'assistant_manager_jobtitle',
 'customer service representative': 'customer_service_representative_jobtitle',
 'cashier': 'cashier_jobtitle',
 'education': 'education_jobtitle',
 'cook': 'cook_jobtitle',
 'administrative': 'administrative_jobtitle',
 'driver': 'driver_jobtitle',
 'blue collar': 'blue_collar_jobtitle',
 'technicians': 'technicians_jobtitle',
 'financial services': 'financial_services_jobtitle',
 'telemarketers': 'telemarketers_jobtitle',
 'fitness/sports': 'fitness_sports_jobtitle',
 'not_specified': 'not_specified_jobtitle',
 'manager': 'manager_jobtitle',
 'server': 'server_jobtitle'}

In [21]:
# Rename the indicator columns using the mapping in `results`; columns whose
# names are not keys of the mapping keep their current names.
extract_work_title = extract_work_title.rename(columns=results)

In [22]:
# Display the labelled frame.
# NOTE(review): the rendered output includes employee_code and employee_name —
# clear or redact it before sharing the notebook.
extract_work_title

Unnamed: 0,employee_code,employee_name,work1_title_label,work2_title_label,work3_title_label,work4_title_label,work5_title_label,work6_title_label,work7_title_label,assistant_manager_jobtitle,customer_service_representative_jobtitle,financial_services_jobtitle,fitness_sports_jobtitle,manager_jobtitle,no_work_title,other_jobtitle,sales_associate_jobtitle,server_jobtitle,telemarketers_jobtitle
0,BDP,"Abdulali, Kaizar",sales associate,sales associate,sales associate,not_specified,not_specified,not_specified,not_specified,0,0,0,0,0,0,0,1,0,0
1,F6Q,"Avila-Gil, Mario",sales associate,customer service representative,not_specified,not_specified,not_specified,not_specified,not_specified,0,1,0,0,0,0,0,1,0,0
2,MRN,"Amini, Kamyar",sales associate,manager,not_specified,not_specified,not_specified,not_specified,not_specified,0,0,0,0,1,0,0,1,0,0
3,MXO,"Awan, Shoaib",manager,fitness/sports,other_jobtitle,sales associate,not_specified,not_specified,not_specified,0,0,0,1,1,0,1,1,0,0
4,N0Y,"Al-Khulaidy, Maha",sales associate,sales associate,customer service representative,fitness/sports,not_specified,not_specified,not_specified,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,MUD,"Ryoo, Sungtak",other_jobtitle,other_jobtitle,manager,other_jobtitle,not_specified,not_specified,not_specified,0,0,0,0,1,0,1,0,0,0
285,L46,"Kapasi, Nikunj",financial services,sales associate,other_jobtitle,not_specified,not_specified,not_specified,not_specified,0,0,1,0,0,0,1,1,0,0
286,N87,"Mcinnis, Peter",other_jobtitle,manager,customer service representative,not_specified,not_specified,not_specified,not_specified,0,1,0,0,1,0,1,0,0,0
287,ND9,"Gamble, Justin",manager,other_jobtitle,sales associate,sales associate,not_specified,not_specified,not_specified,0,0,0,0,1,0,1,1,0,0


In [27]:
# Summary statistics of the numeric columns; for the 0/1 indicator columns the
# mean is the share of rows carrying that label.
extract_work_title.describe()

Unnamed: 0,assistant_manager_jobtitle,customer_service_representative_jobtitle,financial_services_jobtitle,fitness_sports_jobtitle,manager_jobtitle,no_work_title,other_jobtitle,sales_associate_jobtitle,server_jobtitle,telemarketers_jobtitle
count,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0
mean,0.190311,0.311419,0.072664,0.062284,0.401384,0.010381,0.49827,0.806228,0.15917,0.031142
std,0.393228,0.463877,0.260035,0.242089,0.491029,0.101531,0.500864,0.395938,0.366469,0.174002
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# extract_work_title.to_csv('../../../Glentel Inc/HR Analytics - Documents/Capstone Data/ubc_mds_team_share/06_clean_data/work_title.csv')