# Job Title Matching <a class="tocSkip"></a>
    
Based on a list of job titles / job descriptions, build a taxonomy of job titles, similar to:  
`Sales -> Manager`  
`Sales -> Representative`  



# Imports

## Import dependencies

In [1]:
import json

import pandas as pd

import spacy
from spacy import displacy

In [2]:
# Lift pandas' display truncation so frames and series are shown in full.
# (A previous, now disabled, variant capped output at 60 rows / 20 columns.)
# Each option is paired with the exact template used to echo its value back.
unlimited_options = {
    'display.max_rows': 'display.max_rows    = %s',
    'display.max_columns': 'display.max_columns = %s',
}

for option_name in unlimited_options:
    pd.set_option(option_name, None)

# Echo the effective settings so the notebook records what was applied.
for option_name, template in unlimited_options.items():
    print(template % pd.get_option(option_name))

display.max_rows    = None
display.max_columns = None


In [3]:
%%time
# Load the spaCy 'en_core_web_sm' pipeline once; a later cell calls `nlp` on a
# job description to obtain a parsed document.
nlp = spacy.load('en_core_web_sm')

CPU times: user 712 ms, sys: 47.5 ms, total: 759 ms
Wall time: 814 ms


## Import data

In [4]:
%%time
# Read the raw postings and discard the first two columns of the CSV.
df = pd.read_csv('../data/raw/bman93_job/Top30.csv')
df = df.drop(columns=df.columns[:2])

# Clean the Description text with two regex substitutions, applied in order:
#   1. delete every backslash + "r" character pair;
#   2. substitute the ' \\n ' template for every backslash + "n" pair.
# NOTE(review): the replacement is passed to the regex engine as a template,
# which presumably expands its backslash-n into a real newline - confirm this
# is the intended result.
description_substitutions = (
    (r'\\r', ''),
    (r'\\n', ' \\n '),
)
for pattern, replacement in description_substitutions:
    df['Description'] = df['Description'].replace({pattern: replacement}, regex=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72292 entries, 0 to 72291
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Query        72292 non-null  object
 1   Description  72292 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB
CPU times: user 1.54 s, sys: 139 ms, total: 1.68 s
Wall time: 1.68 s


In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Query,Description
0,Administrative Assistant,This Administrative Assistant position is resp...
1,Administrative Assistant,ADMINISTRATIVE ASSISTANT Part Time The West Or...
2,Administrative Assistant,Administrative Assistant - In Bus 26yrs Fashi...
3,Sales Representative,Are you ready for something new? Are you tired...
4,Customer Service Representative,Superior Staff Resources is currently seeking ...


In [6]:
# Run the spaCy pipeline on the description stored under index label 0 and
# render the resulting document with displaCy's entity style.
first_description = df['Description'].loc[0]
doc = nlp(first_description)
displacy.render(doc, style="ent")

# Split titles

In [7]:
# Count how many postings share each job title (the Query column).
df['Query'].value_counts()

Administrative Assistant                                      4395
Customer Service Representative                               4200
Own Your Own Franchise!                                       3701
Sales Representative                                          3556
Mobile Tool Sales / Franchise Distributor                     3275
Retail Wireless Sales Consultant                              3194
Sales / Franchise                                             3120
Project Manager                                               2845
Staff Accountant                                              2834
Retail Sales Associate                                        2769
Business Analyst                                              2420
Sales / Customer Service – Part or Full time – Summer Work    2419
Sales Representative / Account Manager /  Customer Service    2412
Senior Accountant                                             2216
Jani-King Franchise Business Opportunity                      

In [8]:
# Tokenise each title on whitespace and record how many tokens it has.
df['Query_split'] = df['Query'].str.split()
df['Query_word_len'] = df['Query_split'].map(len)

# Distribution of title lengths, most frequent length first.
df['Query_word_len'].value_counts()

2     36443
3     13537
4      9091
1      3292
6      3275
12     2419
8      2412
9      1823
Name: Query_word_len, dtype: int64

In [9]:
# Show the first title that splits into 12 whitespace-separated tokens.
has_twelve_words = df['Query_word_len'] == 12
df.loc[has_twelve_words, 'Query'].iloc[0]

'Sales / Customer Service – Part or Full time – Summer Work'

In [10]:
# Preview the rows whose title consists of exactly two tokens.
is_two_word_title = df['Query_word_len'] == 2
df.loc[is_two_word_title].head()

Unnamed: 0,Query,Description,Query_split,Query_word_len
0,Administrative Assistant,This Administrative Assistant position is resp...,"[Administrative, Assistant]",2
1,Administrative Assistant,ADMINISTRATIVE ASSISTANT Part Time The West Or...,"[Administrative, Assistant]",2
2,Administrative Assistant,Administrative Assistant - In Bus 26yrs Fashi...,"[Administrative, Assistant]",2
3,Sales Representative,Are you ready for something new? Are you tired...,"[Sales, Representative]",2
6,Java Developer,<P><STRONG>As a member of the Web and Portal D...,"[Java, Developer]",2


In [11]:
# Frequency of each distinct two-token title.
is_two_word_title = df['Query_word_len'] == 2
df.loc[is_two_word_title, 'Query'].value_counts()

Administrative Assistant    4395
Sales Representative        3556
Project Manager             2845
Staff Accountant            2834
Business Analyst            2420
Senior Accountant           2216
Benefits Consultant         2049
Store Manager               2021
Account Representative      1977
Account Executive           1876
Maintenance Technician      1834
Java Developer              1796
Financial Analyst           1754
Restaurant Manager          1679
Executive Assistant         1616
Physical Therapist          1575
Name: Query, dtype: int64

## Association dictionary

In [12]:
%%time

# Build a one-directional association map from two-word titles:
#   first word -> list of distinct second words, in order of first appearance.
#
# Fix: the earlier version created an empty list the first time a first word
# was seen and discarded that row's second word, so a pairing that occurred
# only once in the data was silently lost. setdefault() creates the list when
# needed and the partner is then recorded for every row, including the first.
associations = {}
for elem_a, elem_b in df.loc[df.Query_word_len == 2, 'Query_split']:
    partners = associations.setdefault(elem_a, [])
    if elem_b not in partners:  # keep each partner only once
        partners.append(elem_b)


CPU times: user 57.3 ms, sys: 1.2 ms, total: 58.5 ms
Wall time: 57.5 ms


In [13]:
# Persist the association map as JSON. The context manager guarantees the
# file is flushed and closed once the dump finishes (a bare open() inside the
# call left the handle open until garbage collection).
with open('../data/processed/job_title_association.json', 'w') as json_file:
    json.dump(associations, json_file)

# Display the map as the cell output.
associations

{'Administrative': ['Assistant'],
 'Sales': ['Representative'],
 'Java': ['Developer'],
 'Financial': ['Analyst'],
 'Project': ['Manager'],
 'Executive': ['Assistant'],
 'Maintenance': ['Technician'],
 'Physical': ['Therapist'],
 'Store': ['Manager'],
 'Staff': ['Accountant'],
 'Account': ['Executive', 'Representative'],
 'Senior': ['Accountant'],
 'Business': ['Analyst'],
 'Restaurant': ['Manager'],
 'Benefits': ['Consultant']}

In [14]:
%%time

# Build a symmetric association map from two-word titles: every word maps to
# the list of distinct words it was paired with, regardless of position.
#
# Fix: the earlier version created an empty list the first time a word was
# seen and discarded the partner from that row, so a pairing that occurred
# only once in the data was silently lost (in either direction). setdefault()
# creates the list when needed and the partner is recorded for every row.
associations = {}
for elem_a, elem_b in df.loc[df.Query_word_len == 2, 'Query_split']:
    # Register the pair in both directions: first -> second, second -> first.
    for word, partner in ((elem_a, elem_b), (elem_b, elem_a)):
        partners = associations.setdefault(word, [])
        if partner not in partners:  # keep each partner only once
            partners.append(partner)


CPU times: user 62.4 ms, sys: 888 µs, total: 63.3 ms
Wall time: 62.5 ms


In [15]:
# Persist the symmetric association map as JSON. The context manager
# guarantees the file is flushed and closed once the dump finishes (a bare
# open() inside the call left the handle open until garbage collection).
with open('../data/processed/job_title_association_sym.json', 'w') as json_file:
    json.dump(associations, json_file)

# Display the map as the cell output.
associations

{'Administrative': ['Assistant'],
 'Assistant': ['Administrative', 'Executive'],
 'Sales': ['Representative'],
 'Representative': ['Sales', 'Account'],
 'Java': ['Developer'],
 'Developer': ['Java'],
 'Financial': ['Analyst'],
 'Analyst': ['Financial', 'Business'],
 'Project': ['Manager'],
 'Manager': ['Project', 'Store', 'Restaurant'],
 'Executive': ['Assistant', 'Account'],
 'Maintenance': ['Technician'],
 'Technician': ['Maintenance'],
 'Physical': ['Therapist'],
 'Therapist': ['Physical'],
 'Store': ['Manager'],
 'Staff': ['Accountant'],
 'Accountant': ['Senior', 'Staff'],
 'Account': ['Executive', 'Representative'],
 'Senior': ['Accountant'],
 'Business': ['Analyst'],
 'Restaurant': ['Manager'],
 'Benefits': ['Consultant'],
 'Consultant': ['Benefits']}

## Noun extraction