In [1]:
%%html
<style> table {margin-left: 0 !important;} </style>

# Create Labels from Job Description and Resume Datasets
As we do not have a labelled training set marking which resumes gain interviews for particular job descriptions, a heuristic is used to generate the label, which is the similarity of the job title as classified by a pretrained RoBERTa model.

The model is trained by the [GLUE](https://gluebenchmark.com/) task [Semantic Textual Similarity Benchmark (STS-B)](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark), one of the benchmarks used to evaluate the original BERT model. While it is not specific to our task it performs reasonably well to label similarities between job titles.

The goal of this notebook is to create an excel with the format of

Parameter | Description
:-: | :-
title_job | Job Title of the Job Description
text_a* | Job Description
title_res** | Job Title of the Resume Job Experience
text_b* | Description of the Job Experience in a Resume
labels* | Heuristic based on the correlation of the 2 Job Title fields

- Notice:
 - \* fields will be fed into the final RoBERTa model for similarity training
 - ** This is not the content of a full resume, but the job description field of a particular job held by the resume's author currently or in the past - which is considered to have the highest chance to be worth of an interview for a job with similar title.

In [2]:
# simpletransformers only works with this version
!pip install transformers==3.0.2



In [3]:
train_args = {
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
    'evaluate_during_training': True,
    'max_seq_length': 512,
    'num_train_epochs': 10,
    'evaluate_during_training_steps': 50,
    'wandb_project': 'sts-b-medium',
    'train_batch_size': 6,
    'regression': True,
}

In [4]:
# This pre-trained model is to create the labels based on the title - as the heuristic
from simpletransformers.classification import ClassificationModel
model = ClassificationModel('roberta', '../saved_models/outputs-sts-b/', num_labels=1, args=train_args)

In [5]:
# These functions reads in the csv file and prints out sample and summary statistics for the dataset
import pandas as pd
import ast
from functools import partial
def convert_list(x, ast_off=False):
    #print(f"x = {x}")
    if type(x) != str:
        return x
    if len(x) != 0:
        if x[0] in ["{", "["] and x[-1] in ["}", "]"] and not ast_off:
            #print(f"String to be put in AST = {x}")
            try:
                return len(ast.literal_eval(x))
            except SyntaxError:
                #print(x)
                print("1 record corrupted")
                return 0
    return len(x)

def desc_df(path, encoding=None, xls=False, ast_off=False):
    print(path)
    if xls:
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path) if encoding is None else pd.read_csv(path, encoding)
    for col in df.columns:
        print(f"{col}:\n{df.iloc[0][f'{col}']}\n")
        print(df[f'{col}'].fillna(value="").map(partial(convert_list, ast_off=ast_off)).describe())
        print("\n----------------------------------------------------------------------------------")
    df.info()
    return df

In [6]:
# This is the Job posting dataset for the Armenian Job Center
armenian_job = desc_df("D:\\PycharmProjects\\ISMT-S117-project\\data\\job\\armenian_data job posts.csv")
armenian_job

D:\PycharmProjects\ISMT-S117-project\data\job\armenian_data job posts.csv
jobpost:
AMERIA Investment Consulting Company
JOB TITLE:  Chief Financial Officer
POSITION LOCATION: Yerevan, Armenia
JOB DESCRIPTION:   AMERIA Investment Consulting Company is seeking a
Chief Financial Officer. This position manages the company's fiscal and
administrative functions, provides highly responsible and technically
complex staff assistance to the Executive Director. The work performed
requires a high level of technical proficiency in financial management
and investment management, as well as management, supervisory, and
administrative skills.
JOB RESPONSIBILITIES:  
- Supervises financial management and administrative staff, including
assigning responsibilities, reviewing employees' work processes and
products, counseling employees, giving performance evaluations, and
recommending disciplinary action;
- Serves as member of management team participating in both strategic
and operational planning for th

count    19001.000000
mean       553.863376
std        366.413854
min          0.000000
25%        318.000000
50%        482.000000
75%        706.000000
max       5026.000000
Name: RequiredQual, dtype: float64

----------------------------------------------------------------------------------
Salary:
nan

count    19001.000000
mean        26.457081
std         42.183331
min          0.000000
25%          0.000000
50%          4.000000
75%         39.000000
max        393.000000
Name: Salary, dtype: float64

----------------------------------------------------------------------------------
ApplicationP:
To apply for this position, please submit a
cover letter and a resume addressing relevant qualifications and
experience and information on professional reference strictly to Tatevik
Hovhannisyan; Executive Assistant: fax: 374-1-546800 or e-mail:ameria@.... Tel: 374 (1) 524040; 524140. Only shortlisted
candidates will be notified for an interview.
Please clearly mention in your applicati

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,"Dec 28, 2015",Senior Creative UX/ UI Designer,Technolinguistics NGO,,Full-time,,,,Long-term,...,Competitive,"To apply for this position, please send your\r...",29 December 2015,28 January 2016,,As a company Technolinguistics has a mandate t...,,2015,12,False
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Category Development Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Operational Marketing Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,"Dec 30, 2015",Head of Online Sales Department,San Lazzaro LLC,,,,,,Long-term,...,Highly competitive,Interested candidates can send their CVs to:\r...,30 December 2015,29 January 2016,,San Lazzaro LLC works with several internation...,,2015,12,False


In [7]:
# This is the Resume dataset from Indian IT candidates
avani_resume = desc_df("../data/resumes/avani_resume_data.csv")
avani_resume

../data/resumes/avani_resume_data.csv
Resume_title:
Java Developer

count    14804.000000
mean        23.296406
std         47.976468
min          0.000000
25%          4.000000
50%          4.000000
75%         25.000000
max       1021.000000
Name: Resume_title, dtype: float64

----------------------------------------------------------------------------------
City:
Tirur

count    14804.000000
mean         7.044312
std          6.867734
min          0.000000
25%          5.000000
50%          7.000000
75%          9.000000
max        781.000000
Name: City, dtype: float64

----------------------------------------------------------------------------------
State:
 Kerala

count    14804.000000
mean         9.760065
std          3.103059
min          0.000000
25%          8.000000
50%         10.000000
75%         12.000000
max         23.000000
Name: State, dtype: float64

----------------------------------------------------------------------------------
Description:
To prove myself dedi

Unnamed: 0,Resume_title,City,State,Description,work_experiences,Educations,Skills,Links,Certificates,Additional Information
0,Java Developer,Tirur,Kerala,"To prove myself dedicated, worthy and energeti...","{0: [{'wtitle:': 'Java Developer'}, {'wcompany...","{0: [{'e_title:': ""Bachelor's in Bachelor of C...","['Java (Less than 1 year)', ' Jsp (Less than 1...",['https://www.linkedin.com/in/mohamed-rihan-k-...,"{0: [{'c_title:': 'Java Developer'}, {'c_durat...",\nTechnical Expertise \n• Operating Systems: W...
1,Software Developer,Bengaluru,Karnataka,Working as Software Developer at IngroInfo Sof...,"{0: [{'wtitle:': 'JAVA DEVELOPER'}, {'wcompany...",{0: [{'e_title:': 'MCA in Master of Computer A...,"['Programming Languages: Core Java', ' J2EE \n...","['http://github.com/NK-PATEL/Train_Project', '...",{},NONE
2,Java developer,Pune,Maharashtra,Looking for a challenging career which demands...,"{0: [{'wtitle:': 'Java Developer'}, {'wcompany...","{0: [{'e_title:': ""Bachelor's in Electrical En...","['ECLIPSE (1 year)', ' HIBERNATE', ' SPRING (L...",[],{},"\nTECHNICAL SKILLS \n \nFrameworks: Spring, Sp..."
3,Seeking innovative and challenging career assi...,Pune,Maharashtra,NONE,"{0: [{'wtitle:': 'Java Developer'}, {'wcompany...","{0: [{'e_title:': 'BE in Computer'}, {'e_schoo...","['GIT', ' Angular 7', ' MAVEN', ' Java', ' Jen...",[],{},NONE
4,NONE,Pune,Maharashtra,NONE,"{0: [{'wtitle:': 'Java Developer'}, {'wcompany...",{0: [{'e_title:': 'Bachelor of Engineering in ...,['Project: HR Payroll Systems Role: Java Devel...,[],{},NONE
...,...,...,...,...,...,...,...,...,...,...
14799,Back Office Operations & HR Coordinator,Calicut,Kerala,Management Professional seeking a full-time po...,{0: [{'wtitle:': 'Project Administration & Pay...,"{0: [{'e_title:': 'MBA in MARKETING & HR'}, {'...","['MS OFFICE (9 years)', ' ACONEX PROJECT MANAG...",['http://www.linkedin.com/in/akhil-ravi-1512b6...,{0: [{'c_title:': 'ISO 9001:2018 QMS INTERNAL ...,\nISO 9001:2015 QMS LLYOD REGISTER CERTIFIED \...
14800,NONE,NONE,NONE,NONE,"{0: [{'wtitle:': 'Accountant'}, {'wcompany:': ...","{0: [{'e_title:': 'B.COM'}, {'e_schoolname:': ...",['NONE'],[],{},\nCOMPUTER PROFICIENCY \n \nApplications \nAcc...
14801,NONE,Durgapur,West Bengal,To be potential resource to the organization w...,"{0: [{'wtitle:': 'Administrative works'}, {'wc...","{0: [{'e_title:': 'CERTIFICATE'}, {'e_schoolna...","['ACCOUNTING', ' ERP', ' Tally', ' PAYROLL', '...",[],{},\nYEAR OF \nDEGREE/CERTIFICATE INSTITUTION % O...
14802,NONE,NONE,NONE,A finance professional with close to 6 years o...,"{0: [{'wtitle:': 'Assistant Manger'}, {'wcompa...",{0: [{'e_title:': 'Bachelor of Commerce in Com...,"['ERP (2 years)', ' Tally (5 years)', ' MS OFF...",[],{},\nStrengths and Core Competencies \n \n• Knowl...


In [8]:
# As the Avani Resume dataset as fields like work_experiences which are fields extracted from the
# original resumes and stored as dictionary in those fields, the follow procedure extract the job
# title and job description from the dictionary and store in a seperate dataframe called wdescr_df
# The resultant job title will follow the content in the field, instead of avani_resume.Resume_title

DEBUG = False
wdescr_df = pd.DataFrame(columns=["res_no", "exp_no", "wtitle", "wdescr"])
for i, job in enumerate(avani_resume.work_experiences.fillna(value="{}")):
    if DEBUG:
        print(f"Resume {i}:")
    try:
        for key, value in ast.literal_eval(job).items():
            paramlist = ["wtitle", "wduration", "wdescr"]
            for param in paramlist:
                locals()[param] = ""
            for field in value:
                for param in paramlist:
                    locals()[param] = field[param+":"] if param+":" in field.keys() else locals()[param]
            if DEBUG:
                print(f"  Exp {key} {wtitle}, Duration {wduration}", "\n")
                print(wdescr, "\n")
            wdescr_df = wdescr_df.append({"res_no":i, "exp_no":key, "wtitle":wtitle, "wdescr":wdescr}, ignore_index=True)
    except SyntaxError:
        print("  1 Record corrupted - discarded")
print(wdescr_df.wdescr.map(len).describe())
print(wdescr_df.wtitle.value_counts())
wdescr_df

  1 Record corrupted - discarded
  1 Record corrupted - discarded
  1 Record corrupted - discarded
  1 Record corrupted - discarded
count    35536.000000
mean       609.196561
std        946.347108
min          1.000000
25%         33.000000
50%        277.000000
75%        790.000000
max      18977.000000
Name: wdescr, dtype: float64
Developer                     1966
NONE                          1926
Software Developer            1796
PHP Developer                 1317
Java Developer                 993
                              ... 
ETL Developer I                  1
Senior Technical Writer          1
ACTIVE CONTRIBUTOR               1
Axxcelera - Plan Editor          1
Data Developer-Team Member       1
Name: wtitle, Length: 8514, dtype: int64


Unnamed: 0,res_no,exp_no,wtitle,wdescr
0,0,0,Java Developer,NONE
1,1,0,JAVA DEVELOPER,Working as Software Developer at IngroInfo Sof...
2,2,0,Java Developer,NONE
3,3,0,Java Developer,• Working as a Java Developer at Atos Syntel. ...
4,4,0,Java Developer,• 2+ years' of experience in application of pa...
...,...,...,...,...
35531,14802,2,Junior Accountant,• Updating Daily Banking Transactions in Sun S...
35532,14802,3,Accountant,• Posting Accounting Entries in Tally and Proc...
35533,14802,4,TAX CONSULTANT,- Asst.Staff \n \n• Preparation of Books of Ac...
35534,14803,0,Commercial Officer,IT Skills \n \nSkill Version


In [9]:
# After the generation above - wdescr_df is manaully annotated with the column "Type" that identifies IT related resumes
wdescr_df = pd.read_excel("../data/resumes/avani_resume_data_annotated.xlsx")
wdescr_df

Unnamed: 0.1,Unnamed: 0,res_no,exp_no,wtitle,wdescr,Trash,Type,Word Count
0,0,0,0,Java Developer,NONE,Y,,
1,1,1,0,JAVA DEVELOPER,Working as Software Developer at IngroInfo Sof...,,Dev,49.0
2,2,2,0,Java Developer,NONE,Y,,
3,3,3,0,Java Developer,• Working as a Java Developer at Atos Syntel. ...,,Dev,34.0
4,4,4,0,Java Developer,• 2+ years' of experience in application of pa...,,Dev,21.0
...,...,...,...,...,...,...,...,...
35531,35531,14802,2,Junior Accountant,• Updating Daily Banking Transactions in Sun S...,Accountant,,
35532,35532,14802,3,Accountant,• Posting Accounting Entries in Tally and Proc...,Accountant,,
35533,35533,14802,4,TAX CONSULTANT,- Asst.Staff \n \n• Preparation of Books of Ac...,tax,,
35534,35534,14803,0,Commercial Officer,IT Skills \n \nSkill Version,,,5.0


In [10]:
# We first build a table using the matching of *Job Titles* and then use  the RoBERTa pretrained model to
# calculate the heuristic labels. For the # benefit of time in demo sampling is used.
# There are many null or uselessly short fields in the resume job history. Removing those with < 20 tokens
# seems to give a good result
import pandas as pd
pd.reset_option("display.max_rows")
sample_size = 55
job_text = [ [jt, rt]
            for jt in pd.Series(armenian_job[armenian_job.IT].Title.str.lower().unique()).sample(sample_size, random_state=42).tolist()
            for rt in pd.Series(wdescr_df[~((wdescr_df.Type.isna()) | (wdescr_df["Word Count"] < 20))].wtitle.str.lower().unique()).sample(sample_size, random_state=42).tolist()]
len(job_text)

3025

In [11]:
job_pred, job_raw_outputs = model.predict(job_text)
job_pred

HBox(children=(FloatProgress(value=0.0, max=3025.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=379.0), HTML(value='')))




array([0.34648874, 0.6871556 , 0.34885538, ..., 0.2712175 , 0.4444028 ,
       0.37783605], dtype=float32)

In [12]:
# The job_df are pairs of job titles scored and will be used to build the actual training set
# composed of the *job descriptions* and score
pd.reset_option("display.max_rows")
#pd.options.display.max_rows = None
job_df = pd.DataFrame(job_text, columns=["text_a", "text_b"])

print(len(job_pred))
print(len(job_text))

#del job_text

job_df["pred"] = job_pred

#del job_pred

print(job_df.pred.describe())
job_df = job_df.sort_values("pred", ascending=False)
job_df

3025
3025
count    3025.000000
mean        0.921197
std         0.768124
min         0.002934
25%         0.328776
50%         0.578183
75%         1.411603
max         4.319074
Name: pred, dtype: float64


Unnamed: 0,text_a,text_b,pred
930,c/ c++ software engineer,"c,c++ developer",4.319074
2870,java/j2ee developer,sr. java / j2ee developer,4.280726
885,c/ c++ software engineer,c++ developer,4.108096
573,senior software developer/ technical leader,"technical lead, developer",4.100830
655,"senior c++, c# developer","c,c++ developer",4.038229
...,...,...,...
1530,micro lending division credit specialist,office executive and web developer,0.011834
1487,micro lending division credit specialist,associate html programmer - software development,0.011406
1491,micro lending division credit specialist,chennai as senior software engineer/ developer,0.008175
1533,micro lending division credit specialist,computer applications teacher cum website deve...,0.004590


In [13]:
pd.cut(job_df.pred, bins=[-1,1,2,3,4,6], right=True, labels=False).value_counts()

0    1892
1     778
2     322
3      28
4       5
Name: pred, dtype: int64

In [14]:
#-------------- using absolute thresholds ------------------
#pos_thres = 3.5
#neg_thres = 0.25
#pos_cor = job_df[job_df.pred >= pos_thres]
#neg_cor = job_df[job_df.pred <= neg_thres]
#all_cor = job_df[(job_df.pred >= pos_thres) | (job_df.pred <= neg_thres)]
#-------------- highest/lowers 2.5k values------------------
#pos_cor = job_df.iloc[0:round(job_df.shape[0]/4)]
#neg_cor = job_df.iloc[round(job_df.shape[0]/4)*3:job_df.shape[0]]

# As there are fewer highly scored samples, take top 10%, then select 10% from the rest of the samples
sample_size = round(job_df.shape[0]/10)
pos_cor = job_df.iloc[0:sample_size]
neg_cor = job_df.iloc[sample_size:job_df.shape[0]].sample(sample_size, random_state=42)
all_cor = pos_cor.append(neg_cor)
pos_cor.shape, neg_cor.shape, all_cor.shape

((302, 3), (302, 3), (604, 3))

In [15]:
all_cor

Unnamed: 0,text_a,text_b,pred
930,c/ c++ software engineer,"c,c++ developer",4.319074
2870,java/j2ee developer,sr. java / j2ee developer,4.280726
885,c/ c++ software engineer,c++ developer,4.108096
573,senior software developer/ technical leader,"technical lead, developer",4.100830
655,"senior c++, c# developer","c,c++ developer",4.038229
...,...,...,...
2584,java intern,associate software developer,0.387010
201,flash/ as3 developer,developer & solution analyst,1.575380
1923,graphic designer / customer service operator,senior developer/ business analyst,1.577018
2088,unix systems administrator,senior developer/ business analyst,0.273131


In [16]:
# new training data score distribution
print(all_cor.pred.describe())
pd.cut(all_cor.pred, bins=[-1,1,2,3,4,6], right=True, labels=False).value_counts()

count    604.000000
mean       1.664684
std        1.010826
min        0.004590
25%        0.548558
50%        2.076045
75%        2.421403
max        4.319074
Name: pred, dtype: float64


2    274
0    201
1     96
3     28
4      5
Name: pred, dtype: int64

In [17]:
# The Armenian Job Dataset has standard format with key words preceeding it. Many of those sections like
# application procedures and company descriptions are not relevant to candidate selections. This extracts
# the 3 sections including Job Description, Job Responsibilities and Qualifications
pd.options.display.max_columns = None
from sklearn.feature_extraction.text import CountVectorizer
tok = CountVectorizer().build_tokenizer()

DEBUG = False
import re
section_df = pd.DataFrame(columns=["TITLE", "DESCRIPTION", "RESPONSIBILITIES", "QUALIFICATIONS"])
section_tags = "JOB TITLE POSITION LOCATION DESCRIPTION RESPONSIBILITIES REQUIRED QUALIFICATIONS APPLICATION PROCEDURES DEADLINE DURATION START DATE ABOUT COMPANY REMUNERATION WORK HOURS SELECTION PROCESS ADDITIONAL CRITERIA POINT OF CONTACT DEFINITIONS PREFERRED COURSE OVERVIEW".split()

#for row in armenian_job[armenian_job.IT].jobpost:
for _, row in armenian_job[armenian_job.IT].iterrows():
    cur_col = ""
    #row_df = pd.DataFrame({"TITLE":[[]], "DESCRIPTION":[[]], "RESPONSIBILITIES":[[]], "QUALIFICATIONS":[[]]})
    row_df = pd.DataFrame({"TITLE":row.Title, "DESCRIPTION":[[]], "RESPONSIBILITIES":[[]], "QUALIFICATIONS":[[]]})
    for i in row.jobpost.split(sep=":"):
        if DEBUG:
            print("-------------------------------------------------------------------------------------")
        #for my_tok in rep_model.tokenizer.decode(rep_model.tokenizer.encode(i)).split():
        for my_tok in tok(i):
            if DEBUG:
                print(my_tok)
            switched = False
            if my_tok not in section_tags:
                if not (re.match("^\[.*\]$", my_tok) or cur_col == ""):
                    # row_df.iloc[0][cur_col].append(re.sub("\[CLS\]", "", my_tok)) # no need as Title is not taken
                    row_df.iloc[0][cur_col].append(my_tok)
                    if DEBUG:
                        print(f"{my_tok} Added to {cur_col}")
                        print(row_df)
            else:
                #switched, cur_col = (True, my_tok) if my_tok == "TITLE" else (switched, cur_col)
                switched, cur_col = (True, my_tok) if my_tok == "DESCRIPTION" else (switched, cur_col)
                switched, cur_col = (True, my_tok) if my_tok == "RESPONSIBILITIES" else (switched, cur_col)
                switched, cur_col = (True, my_tok) if my_tok == "QUALIFICATIONS" else (switched, cur_col)
                if not switched:
                    cur_col = ""
                if DEBUG:
                    print(f"Current list is {cur_col}")
    section_df = section_df.append(row_df, ignore_index=True)
section_df["RESUME"] = section_df.apply(lambda x: " ".join(x.DESCRIPTION) + " " + " ".join(x.RESPONSIBILITIES) + " " + " ".join(x.QUALIFICATIONS), axis=1)
section_df

Unnamed: 0,TITLE,DESCRIPTION,RESPONSIBILITIES,QUALIFICATIONS,RESUME
0,Software Developer,[],"[Rendering, technical, assistance, to, Databas...","[University, degree, economical, background, i...",Rendering technical assistance to Database Ma...
1,Network Administrator,[],"[Network, monitoring, and, administration, Dat...","[Excellent, knowledge, of, Windows, 2000, Serv...",Network monitoring and administration Databas...
2,Graphic Designer,"[The, position, of, Graphic, Designer, GD, dem...","[Graphic, Designer, will, be, responsible, for...","[As, GD, you, are, creative, innovative, and, ...",The position of Graphic Designer GD demands pr...
3,Demographic Analysis Workshop,"[Demographic, analysis, and, population, proje...",[],"[Participants, should, be, mid, level, profess...",Demographic analysis and population projection...
4,Programmer,[],[],"[Work, experience, of, at, least, two, years, ...",Work experience of at least two years Knowle...
...,...,...,...,...,...
3754,Junior Mobile Visual UI/ UX Designer,"[PicsArt, is, looking, for, creative, Junior, ...","[Create, visual, language, for, PicsArt, user,...","[Experience, in, shipping, beautifully, design...",PicsArt is looking for creative Junior Mobile ...
3755,JavaScript Professional,"[Berg, Development, is, looking, for, JavaScri...","[Re, develop, and, further, the, development, ...","[Expert, knowledge, of, JavaScript, especially...",Berg Development is looking for JavaScript Pro...
3756,Java Developer,"[NASDAQ, OMX, Armenia, is, seeking, an, energe...","[Responsible, for, developing, and, building, ...","[Master, degree, in, IT, Engineering, or, rela...",NASDAQ OMX Armenia is seeking an energetic Jav...
3757,C/ C++ Developer,"[NASDAQ, OMX, Armenia, is, seeking, an, energe...",[],"[Master, degree, in, IT, Engineering, or, rela...",NASDAQ OMX Armenia is seeking an energetic Dev...


In [18]:
# This uses the section_df from jobs and wdescr_df from resumes as lookup, and perform a table join
# on all the job_title_from_job:job_title_from_resume pairs in all_cor table, and score then with
# the score as in the same row.
# text_a, text_b and the labels will eventually be used in the RoBERTa sentence pair classification
# where the other columns like job titles are ignored.
pd.reset_option("display.max_rows")
all_df = pd.DataFrame(columns=["title_job", "text_a", "title_res", "text_b", "labels"])
DEBUG = False
for _, row in all_cor.iterrows():
    job_desc = pd.Series(section_df[section_df.TITLE.str.lower() == row.text_a].RESUME.unique())
    res_desc = pd.Series(wdescr_df[~((wdescr_df.Type.isna()) | (wdescr_df["Word Count"] < 20)) &
                                   (wdescr_df.wtitle.str.lower() == row.text_b)].wdescr.unique())
    if DEBUG:
        print(f"text_a: {row.text_a}, job_desc: {job_desc.shape}")
        print(f"text_b: {row.text_b}, res_desc: {res_desc.shape}")
        print("------------------------------------------------------------------------------")
    for job in job_desc:
        for res in res_desc:
            all_df = all_df.append({"title_job":row.text_a, "text_a":job, "title_res":row.text_b, "text_b":res, "labels":row.pred}, ignore_index=True)
all_df

Unnamed: 0,title_job,text_a,title_res,text_b,labels
0,c/ c++ software engineer,LTX Credence Armenia LLC is looking for Softwa...,"c,c++ developer","Languages Known: C, C++, Data Structures, \nJa...",4.319074
1,c/ c++ software engineer,LTX Credence Armenia LLC is looking for Softwa...,"c,c++ developer","Languages Known: C, C++, Data Structures, \nJa...",4.319074
2,c/ c++ software engineer,LTX Credence Armenia LLC is looking for Softwa...,"c,c++ developer","Languages Known: C, C++, Data Structures, \nJa...",4.319074
3,c/ c++ software engineer,Dom Daniel Armenia is looking for dynamic self...,"c,c++ developer","Languages Known: C, C++, Data Structures, \nJa...",4.319074
4,c/ c++ software engineer,LTX Credence Armenia LLC is looking for Softwa...,"c,c++ developer","Languages Known: C, C++, Data Structures, \nJa...",4.319074
...,...,...,...,...,...
7613,unix systems administrator,Administration of Corporate Unix servers Plan...,senior developer/ business analyst,12-Sep-2012 Present Senior Developer/ Business...,0.273131
7614,unix systems administrator,Administration of corporate Unix Solaris Linu...,senior developer/ business analyst,12-Sep-2012 Present Senior Developer/ Business...,0.273131
7615,unix systems administrator,ArmenTel is seeking for candidates to fulfill ...,senior developer/ business analyst,12-Sep-2012 Present Senior Developer/ Business...,0.273131
7616,unix systems administrator,ArmenTel is looking for candidates to fulfill ...,senior developer/ business analyst,12-Sep-2012 Present Senior Developer/ Business...,0.273131


In [19]:
# This is the actual # of the full set of samples
print(all_df.labels.describe())
pd.cut(all_df.labels, bins=[-1,1,2,3,4,6], right=True, labels=False).value_counts()

count    7618.000000
mean        2.162762
std         0.858653
min         0.004590
25%         2.144637
50%         2.387411
75%         2.644098
max         4.319074
Name: labels, dtype: float64


2    5143
0    1251
3     692
1     482
4      50
Name: labels, dtype: int64

In [20]:
# split the corpuse into training and evaluation set
import numpy as np
np.random.seed(seed=42)
pct_train = 0.5
train_bool = np.random.random(len(all_df))<=pct_train
train_df = all_df[train_bool]
eval_df = all_df[~train_bool]
train_bool = np.random.random(len(eval_df))<=pct_train
test_df = eval_df[~train_bool]
eval_df = eval_df[train_bool]
train_df.shape, eval_df.shape, test_df.shape

((3844, 5), (1914, 5), (1860, 5))

In [21]:
# display record token length distribution to determine RoBERTa sequence length
from sklearn.feature_extraction.text import CountVectorizer

tok = CountVectorizer().build_tokenizer()
train_df.text_a.map(tok).map(len).describe(), eval_df.text_b.map(tok).map(len).describe(), test_df.text_b.map(tok).map(len).describe()

(count    3844.000000
 mean      143.517950
 std        63.985296
 min        14.000000
 25%       101.000000
 50%       134.000000
 75%       174.500000
 max       457.000000
 Name: text_a, dtype: float64,
 count    1914.000000
 mean      122.719958
 std       113.179210
 min        20.000000
 25%        45.000000
 50%        91.000000
 75%       142.000000
 max       594.000000
 Name: text_b, dtype: float64,
 count    1860.000000
 mean      120.052151
 std       106.285360
 min        20.000000
 25%        45.000000
 50%        88.000000
 75%       142.000000
 max       594.000000
 Name: text_b, dtype: float64)

In [22]:
suffix = "_submit"
train_df.to_excel(f"../data/train_df{suffix}.xlsx")
eval_df.to_excel(f"../data/eval_df{suffix}.xlsx")
test_df.to_excel(f"../data/test_df{suffix}.xlsx")