In [None]:
import jsonlines
import pandas as pd
import re
from os.path import join
from hr_research.config import output_path
from collections import defaultdict 
from itertools import groupby

In [None]:
# Primary input: a JSON-lines file; the loading cell reads the 'titles' field of each record.
filepath = join(output_path, "flat_experience_titles.jsonl")
# Supplementary input with the same 'titles' field; it only adds evidence for titles
# whose normalized form already appeared in the primary file.
extra_data_filepath = join(output_path, "extra_flat_experience_titles.jsonl")

In [None]:
def normalize_title(title):
    """Return a canonical spelling of a raw job title.

    The title is lower-cased and stripped, dots are removed, and slashes,
    backslashes and hyphens become spaces. The whole words "senior" and
    "junior" are abbreviated to "sr" and "jr", and every run of whitespace
    is collapsed into a single space.

    Args:
        title: Raw job title string.

    Returns:
        The normalized title string.
    """
    cleaned = title.lower().strip().replace('.', '')

    # Separator characters are treated as word boundaries.
    for separator in ('/', '\\', '-'):
        cleaned = cleaned.replace(separator, ' ')

    # Abbreviate seniority markers; \b keeps words such as "seniority" intact.
    for long_form, short_form in ((r"\bsenior\b", "sr"), (r"\bjunior\b", "jr")):
        cleaned = re.sub(long_form, short_form, cleaned)

    # Splitting and re-joining collapses repeated whitespace.
    return ' '.join(cleaned.split())

In [None]:
def generate_job_seniority(normalized_jobs, seniority_matrix):
    """Count every ordered pair of distinct titles in one title sequence.

    For each pair of positions i < j, the counter stored under
    (normalized_jobs[i], normalized_jobs[j]) is incremented by one.
    Pairs made of two equal titles are skipped.

    Args:
        normalized_jobs: List of normalized titles, in the order they
            appear in the record.
        seniority_matrix: Mapping from (title, title) tuples to counts;
            updated in place. It must supply a default for missing keys
            (e.g. a defaultdict), because counters are incremented with +=.

    Returns:
        None.
    """
    for position, earlier in enumerate(normalized_jobs):
        # Only look at titles that come after the current one.
        for later in normalized_jobs[position + 1:]:
            if earlier == later:
                continue
            seniority_matrix[(earlier, later)] += 1

In [None]:
def get_seniority(job, other_job, seniority_matrix):
    """Return the share of observations in which `job` precedes `other_job`.

    The value is count(job, other_job) divided by the total count of both
    directions, so it lies in [0, 1]; 0.5 means no preferred order.

    Args:
        job: First normalized title.
        other_job: Second normalized title.
        seniority_matrix: Mapping from (title, title) tuples to counts.

    Returns:
        The ratio as a float, or NaN when the pair was never observed in
        either direction (instead of raising ZeroDivisionError).
    """
    # .get() avoids inserting zero entries into a defaultdict as a side effect
    # of merely looking a pair up.
    forward = seniority_matrix.get((job, other_job), 0)
    backward = seniority_matrix.get((other_job, job), 0)

    total = forward + backward
    if total == 0:
        return float("nan")
    return forward / total

In [None]:
# SYNONYMS: normalized title -> set of raw spellings that normalize to it.
# SENIORITY_MATRIX: (title_a, title_b) -> number of times title_a was listed
# before title_b within one record.
SYNONYMS = defaultdict(set)
SENIORITY_MATRIX = defaultdict(int)

with jsonlines.open(filepath, 'r') as reader:
    for i, obj in enumerate(reader):
        # Report progress before any `continue`, so records with too few
        # titles do not silently swallow the progress line.
        if i % 50000 == 0:
            print(i)

        jobs = obj['titles']

        # remove consecutive duplicates
        # NOTE(review): this runs on the raw titles, so neighbours that differ
        # only in spelling still count as separate entries — confirm intended.
        jobs = [key for key, _group in groupby(jobs)]

        # A single title carries no ordering information.
        if len(jobs) < 2:
            continue

        normalized_jobs = []
        for job in jobs:
            norm_job = normalize_title(job)
            SYNONYMS[norm_job].add(job)
            normalized_jobs.append(norm_job)

        generate_job_seniority(normalized_jobs, SENIORITY_MATRIX)

In [None]:
# Fold the supplementary file into SYNONYMS and SENIORITY_MATRIX.
# NOTE: this cell adds to counters created by the primary-file cell; running it
# again without rebuilding them counts every supplementary record twice.
with jsonlines.open(extra_data_filepath, 'r') as reader:
    for i, obj in enumerate(reader):
        # Report progress before any `continue`, so skipped records do not
        # silently swallow the progress line.
        if i % 50000 == 0:
            print(i)

        jobs = obj['titles']

        # remove consecutive duplicates
        jobs = [key for key, _group in groupby(jobs)]

        normalized_jobs = []
        for job in jobs:
            norm_job = normalize_title(job)
            # Titles whose normalized form is not already a key of SYNONYMS
            # are ignored: this file must not introduce new titles.
            if norm_job not in SYNONYMS:
                continue
            SYNONYMS[norm_job].add(job)
            normalized_jobs.append(norm_job)

        # After filtering, fewer than two titles leaves nothing to compare.
        if len(normalized_jobs) < 2:
            continue

        generate_job_seniority(normalized_jobs, SENIORITY_MATRIX)

In [None]:
# SALIENCES maps each unordered title pair, keyed by its sorted tuple, to the
# total number of observations in either direction.
#
# Every key of SENIORITY_MATRIX is folded into its sorted form. A pair that was
# only ever seen in reverse-alphabetical order is therefore still counted, and
# both directions accumulate under one key instead of two separate entries.
SALIENCES = defaultdict(int)
for pair, count in SENIORITY_MATRIX.items():
    SALIENCES[tuple(sorted(pair))] += count

In [None]:
SALIENCE_LOWER_BOUND = 100

# Keep the pairs whose salience is strictly above the bound, ordered by
# ascending salience.
salient_pairs = [
    (pair, count) for pair, count in SALIENCES.items() if count > SALIENCE_LOWER_BOUND
]
salient_pairs.sort(key=lambda entry: entry[1])

sorted_sal = pd.DataFrame(salient_pairs, columns=['combination', 'salience'])

# seniority_level: share of observations in which combination[0] was listed
# before combination[1].
sorted_sal['seniority_level'] = sorted_sal.apply(
    lambda row: get_seniority(row.combination[0], row.combination[1], SENIORITY_MATRIX),
    axis=1,
)

In [None]:
# Summary statistics of the salient-pair table.
sorted_sal.describe()

In [None]:
# Pairs whose seniority_level is more than 0.2 away from the neutral value 0.5.
# NOTE: sorted_sal was already restricted to salience > SALIENCE_LOWER_BOUND (100),
# so the `salience > 30` condition cannot exclude any row while that bound is >= 30.
sorted_sal[(sorted_sal.salience > 30) & (abs(sorted_sal.seniority_level - 0.5) > 0.2)]

In [None]:
def _order_pair(row):
    """Return the row's pair with the more often earlier-listed title first."""
    if row.seniority_level > 0.5:
        return row.combination
    first, second = row.combination
    return (second, first)


# ordered_combination: the pair oriented so that element 0 is the title that
# more often precedes element 1 within a record.
sorted_sal['ordered_combination'] = sorted_sal.apply(_order_pair, axis=1)

In [None]:
# Preview of the pairs with a clear preferred order (seniority_level more than
# 0.2 away from 0.5); the same filter selects the rows written to disk.
sorted_sal[abs(sorted_sal.seniority_level - 0.5) > 0.2]

In [None]:
out_pairs_path = join(output_path, "seniority_pairs.jsonl")

# Only pairs with a clear preferred order are exported.
decisive_pairs = sorted_sal[abs(sorted_sal.seniority_level - 0.5) > 0.2]

with jsonlines.open(out_pairs_path, "w") as writer:
    for _, row in decisive_pairs.iterrows():
        # Element 0 is the title that more often appears earlier in a record.
        # Labelling it "junior" assumes the 'titles' lists are in chronological
        # order — TODO confirm against the data source.
        junior_title, senior_title = row.ordered_combination

        # Sets have no stable iteration order across interpreter runs; sorting
        # the raw spellings makes the output file reproducible.
        writer.write({
            "junior": sorted(SYNONYMS[junior_title]),
            "senior": sorted(SYNONYMS[senior_title]),
        })

In [None]:
# Spot check: share of observations in which "developer" was listed before
# "enterprise architect".
get_seniority("developer", "enterprise architect", SENIORITY_MATRIX)

In [None]:
# (normalized title, number of raw spellings) for every title whose spelling
# count is not exactly one, in ascending order of that count.
synonyms_len = sorted(
    (
        (title, len(variants))
        for title, variants in SYNONYMS.items()
        if len(variants) != 1
    ),
    key=lambda entry: entry[1],
)

# The 20 titles with the most raw spellings.
synonyms_len[-20:]

In [None]:
# Distribution of spelling counts. Despite its label, 'job_count' holds the number
# of distinct raw spellings recorded for each normalized title.
pd.DataFrame(synonyms_len, columns=['job_title', 'job_count']).describe()