
---
# Load needed libraries and functions

In [1]:

%run ../../load_magic/storage.py
%run ../../load_magic/paths.py
%run ../../load_magic/lists.py
%run ../../load_magic/environment.py
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
%pprint

# Show which notebook is running
# NOTE(review): get_notebook_path and Storage are presumably defined by the %run scripts above -- confirm
notebook_path = get_notebook_path()
print(notebook_path)

# Create the storage helper and list its public attributes for reference
s = Storage()
print(['s.{}'.format(fn) for fn in dir(s) if not fn.startswith('_')])

# Load the saved objects used throughout the notebook
hunting_df = s.load_object('hunting_df')
basic_quals_dict = s.load_object('basic_quals_dict')

# Display every name currently defined in the notebook namespace
dir()

Pretty printing has been turned OFF
D:\Documents\Repositories\notebooks\Miscellaneous\ipynb\Job Hunting.ipynb
['s.attempt_to_pickle', 's.data_csv_folder', 's.data_folder', 's.encoding_type', 's.load_csv', 's.load_dataframes', 's.load_object', 's.save_dataframes', 's.saves_csv_folder', 's.saves_folder', 's.saves_pickle_folder', 's.store_objects']


['Config', 'CountVectorizer', 'In', 'Out', 'RandomForestClassifier', 'SequenceMatcher', 'Storage', 'TfidfTransformer', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__loader__', '__name__', '__nonzero__', '__package__', '__spec__', '_dh', '_i', '_i1', '_ih', '_ii', '_iii', '_oh', 'basic_quals_dict', 'check_4_doubles', 'check_for_typos', 'conjunctify_list', 'copyfile', 'csv', 'encoding', 'exit', 'get_classifier', 'get_data_structs_dataframe', 'get_datastructure_prediction', 'get_dir_tree', 'get_git_lfs_track_commands', 'get_importances', 'get_input_sample', 'get_ipython', 'get_module_version', 'get_notebook_path', 'get_specific_gitignore_files', 'get_struct_name', 'humanize_bytes', 'hunting_df', 'ipykernel', 'json', 'jupyter_config_dir', 'notebook_path', 'notebookapp', 'np', 'os', 'pd', 'pickle', 'preprocess_data', 'print_all_files_ending_starting_with', 'print_all_files_ending_with', 'print_all_files_starting_with', 'quit', 're', 'remove_empty_folders', 's', 'similar', '


---
# Needed extra functions

In [2]:

# Email prep
# Subject-line template; print_emails fills it with the percent fit and the requisition string
subject_str = '{}% fit: Internal Candidate, Dave Babbitt, for {}'
# Closing paragraph that print_emails writes after the qualifications list
concerns_str = 'One important question I have is if the work can be supported remotely or if this position is available for remote delivery '
concerns_str += '(or something equivalent).'
concerns_str += " I don't want to move my family out of New England."
# Editor executable that print_emails launches on each generated email file
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
# Folder (under the storage helper's saves folder) that holds the generated email text files
emails_dir = os.path.join(s.saves_folder, 'emails')
os.makedirs(name=emails_dir, exist_ok=True)
name_regex = re.compile(r'^([^(]+) \((\d+)\)')
def clean_email(email_str):
    """
    Turn a 'First Last (12345)' string into an addressable email recipient.
    
    Parameters
    ----------
    email_str : str
        Text expected to start with a name followed by a parenthesized
        run of digits.
    
    Returns
    -------
    str
        'Last, First [USA] <digits@bah.com>' when the pattern matches and
        the name has at least two whitespace-separated parts (only the
        first two parts are used); the stripped name alone when it has a
        single part; the input unchanged when the pattern does not match.
    """
    parsed = name_regex.search(email_str)
    if parsed is None:
        return email_str
    
    full_name, employee_id = (group.strip() for group in parsed.groups())
    name_parts = re.split(r'\s+', full_name)
    if len(name_parts) < 2:
        return full_name
    
    first_name, last_name = name_parts[:2]
    
    return '{}, {} [USA] <{}@bah.com>'.format(last_name, first_name, employee_id)

In [3]:

def get_percent_fit(row_series):
    """
    Return the row's 'percent_fit' fraction as a whole-number percentage.
    
    Parameters
    ----------
    row_series : mapping
        Anything indexable by 'percent_fit' (e.g. a dataframe row).
    
    Returns
    -------
    int
        The fraction times 100, rounded to the nearest integer; 0 when the
        value is missing (None or NaN).
    """
    percent_fit = row_series['percent_fit']
    if (percent_fit is None) or (str(percent_fit) == 'nan'):
        
        return 0
    
    # Round instead of truncating: 0.57*100 is 56.99999999999999 in floating
    # point, which int() alone would report as 56
    return int(round(float(percent_fit) * 100))

In [4]:

def print_loc_computation(row_index, quals_list, basic_quals_dict):
    """
    Print a blank line, then a ready-to-paste assignment statement that
    computes the row's percent_fit from the individual qualification scores.
    
    Parameters
    ----------
    row_index : object
        Index label interpolated into the printed hunting_df.loc[...] target.
    quals_list : list of str
        Qualification strings, in the order their scores should appear.
    basic_quals_dict : dict
        Known scores keyed by qualification string; a qualification that
        is not a key is printed as the placeholder 000.
    """
    print()
    score_strs = [
        str(basic_quals_dict[qual_str]) if qual_str in basic_quals_dict else '000'
        for qual_str in quals_list
    ]
    statement = "hunting_df.loc[{}, 'percent_fit'] = ({})/{}".format(row_index, '+'.join(score_strs), len(quals_list))
    print(statement)

In [5]:

def get_predictions_and_counts(prediction_list, quals_list):
    """
    Pair each qualification with the second element of its prediction.
    
    Parameters
    ----------
    prediction_list : sequence
        One indexable item per qualification; element [1] is the value used.
    quals_list : sequence of str
        Qualification strings, aligned with prediction_list.
    
    Returns
    -------
    tuple (str, int)
        A string with one '\\n<prediction> <qualification>' entry per pair,
        and the number of predictions strictly greater than 0.5.
    """
    pairs = [(pred_array[1], qual_str) for pred_array, qual_str in zip(prediction_list, quals_list)]
    prediction_str = ''.join('\n{} {}'.format(prediction, qual_str) for prediction, qual_str in pairs)
    qual_count = sum(1 for prediction, _ in pairs if prediction > 0.5)
    
    return prediction_str, qual_count

In [6]:

def get_quals_str(prediction_list, quals_list, basic_quals_dict):
    """
    Build a printable listing of the qualifications with their predictions.
    
    Parameters
    ----------
    prediction_list : sequence
        One indexable item per qualification; element [1] is the value shown.
    quals_list : sequence of str
        Qualification strings, aligned with prediction_list.
    basic_quals_dict : dict
        Qualifications that are NOT keys of this dict are flagged with a
        leading asterisk in the listing.
    
    Returns
    -------
    tuple (str, int)
        The listing (one '\\n'-prefixed entry per qualification) and the
        number of predictions strictly greater than 0.5.
    """
    entries = []
    qual_count = 0
    for i, (pred_array, qual_str) in enumerate(zip(prediction_list, quals_list)):
        template = '\nquals_list[{}] = "{}" ({})' if qual_str in basic_quals_dict else '\n*quals_list[{}] = "{}" ({})'
        prediction = pred_array[1]
        entries.append(template.format(i, qual_str, prediction))
        qual_count += int(prediction > 0.5)
    
    return ''.join(entries), qual_count

In [7]:

def print_fit_job(row_index, row_series, basic_quals_dict):
    """
    Estimate how well one job row fits and print the details for strong fits.
    
    The job's fitness is the fraction of its qualifications whose
    prediction (from predict_percent_fit) is above 0.5. When that fraction
    is above 0.8 the qualification listing, the fraction and a
    ready-to-paste percent_fit assignment are printed.
    
    Parameters
    ----------
    row_index : object
        Index label of the row, passed through to print_loc_computation.
    row_series : mapping
        Row providing the 'Job Description' text.
    basic_quals_dict : dict
        Known qualification scores, passed through to the helpers.
    
    Returns
    -------
    tuple (list, float)
        The qualifications extracted from the description and the fitness
        fraction (0.0 when no qualifications were found).
    """
    quals_list = get_quals_list(row_series['Job Description'])
    if not quals_list:
        
        return quals_list, 0.0
    
    prediction_list = list(predict_percent_fit(quals_list))
    quals_str, qual_count = get_quals_str(prediction_list, quals_list, basic_quals_dict)
    job_fitness = qual_count/len(prediction_list)
    is_strong_fit = job_fitness > 0.8
    if is_strong_fit:
        print('Basic Qualifications:{}'.format(quals_str))
        print(job_fitness)
        print_loc_computation(row_index, quals_list, basic_quals_dict)
    
    return quals_list, job_fitness

In [8]:

def qual_sum(qual_str):
    """
    Return the score of one qualification as a string.
    
    A qualification found in the global basic_quals_dict yields its stored
    value; otherwise the model's prediction (element [0][1] of
    predict_percent_fit) is thresholded to 1.0 when above 0.5, else 0.0.
    
    Parameters
    ----------
    qual_str : str
        The qualification text.
    
    Returns
    -------
    str
        str() of the stored value or of the thresholded prediction.
    """
    if qual_str in basic_quals_dict:
        
        return str(basic_quals_dict[qual_str])
    
    probability = predict_percent_fit([qual_str])[0][1]
    
    return str(1.0 if probability > 0.5 else 0.0)

In [10]:

# A token starts with a letter or a non-zero digit, continues with letters and
# digits, and may carry a trailing '#'/'+' suffix of up to two characters
scanner_regex = re.compile(r'\b[1-9a-zA-Z][0-9a-zA-Z]*( *[#\+]{1,2}|\b)')
def regex_tokenizer(corpus):
    """
    Split text into the tokens recognized by scanner_regex.
    
    Parameters
    ----------
    corpus : str
        The text to tokenize.
    
    Returns
    -------
    list of str
        The full text of every non-overlapping match, in order.
    """
    tokens = []
    for match in scanner_regex.finditer(corpus):
        tokens.append(match.group())
    
    return tokens

In [11]:

import string

# Matches runs of characters that are not in string.printable. re.escape is
# required: unescaped, the backslash in string.printable escapes the ']' that
# follows it, which leaves backslash itself out of the class and causes every
# backslash in the text to be replaced with a space.
printable_regex = re.compile('[^{}]+'.format(re.escape(string.printable)))
def un_msword_ify(x):
    """
    Normalize text pasted from word processors into plain ASCII.
    
    Parameters
    ----------
    x : object
        The value to clean; it is converted with str() first.
    
    Returns
    -------
    str
        The text with every run of non-printable or non-ASCII characters
        replaced by one space, runs of spaces collapsed, '::' reduced to
        ':', a trailing ':' removed and a leading '-' removed.
    """
    msword_str = str(x)
    msword_str = printable_regex.sub(r' ', msword_str).strip()
    msword_str = re.sub(r'[^\x00-\x7f]+', r' ', msword_str).strip()
    msword_str = re.sub(r' +', ' ', msword_str)
    msword_str = re.sub(r'::', ':', msword_str)
    msword_str = re.sub(r':$', '', msword_str)
    msword_str = re.sub(r'^-', '', msword_str)
    
    return msword_str

In [12]:

# Headings that introduce the optional ("nice to have") qualifications
a_list = ['Additional Qualifications?', 'Nice If You Have', 'Nice if you have', 'Nice if You Have',
          'Additional Preferred Qualifications', 'Nice if you Have', 'Additional qualifications', 'Nice to Have']
a_str = '({}):?'.format('|'.join(a_list))
def get_quals_list(job_description):
    """
    Extract the basic-qualification lines from a job description.
    
    The cleaned description is narrowed in stages: the text after the last
    'Key Role:' / 'The Challenge:' heading, then the text after the last
    'Basic Qualifications' / 'You Have' heading that starts a line, then
    the text before the first additional-qualifications heading (or,
    failing that, before the first 'Clearance:' / 'Build Your Career:').
    
    Parameters
    ----------
    job_description : object
        The raw description; it is cleaned with un_msword_ify first.
    
    Returns
    -------
    list of str
        The non-empty, cleaned and stripped lines of the qualifications
        section; an empty list when the section is empty.
    """
    text = un_msword_ify(job_description)
    
    # Drop everything up to the last role/challenge heading
    parts = re.split('(Key Role|The Challenge):', text)
    if len(parts) > 1:
        text = parts[-1].strip()
    
    # Drop everything up to the last basic-qualifications heading
    parts = re.split('[\r\n]+(Basic Qualifications?|You Have|You have):?', text)
    if len(parts) > 1:
        text = parts[-1].strip()
    
    # Keep only what precedes the optional qualifications (or their fallback headings)
    parts = re.split(a_str, text)
    if len(parts) <= 1:
        parts = re.split('(Clearance|Build Your Career):', text)
    basic_quals = parts[0].strip()
    if basic_quals == '':
        
        return []
    
    cleaned_lines = (un_msword_ify(line) for line in re.split('[\r\n]+', basic_quals))
    
    return [line.strip() for line in cleaned_lines if line.strip() != '']

In [13]:

def get_email_tuple(row_series, min_days_between_emails=14):
    """
    Return the job's contacts who have not been emailed recently.
    
    Parameters
    ----------
    row_series : mapping
        Row providing the 'Hiring Manager' and 'Primary Recruiter' strings.
    min_days_between_emails : int, optional
        A contact found in the stored last_emailed_dict is included only
        when more than this many whole days have passed since the recorded
        timestamp. Defaults to 14.
    
    Returns
    -------
    tuple of str
        The cleaned hiring-manager and recruiter addresses (in that order)
        that passed the check; may be empty.
    """
    candidate_emails = (clean_email(row_series['Hiring Manager']), clean_email(row_series['Primary Recruiter']))
    last_emailed_dict = s.load_object('last_emailed_dict')
    
    # pd.Timestamp.now() avoids relying on `datetime`, which this notebook only
    # imports in a later cell (NameError on a fresh kernel otherwise)
    now = pd.Timestamp.now()
    notspammed_list = []
    for email in candidate_emails:
        if email in last_emailed_dict:
            days_diff = (now - last_emailed_dict[email]).days
            if days_diff > min_days_between_emails:
                notspammed_list.append(email)
        else:
            notspammed_list.append(email)
    
    return tuple(notspammed_list)

In [14]:

def print_emails(match_series):
    """
    Write one draft email text file per selected job and open each in the text editor.
    
    Parameters
    ----------
    match_series : boolean mask
        Selects the rows of the global hunting_df to write emails for.
    
    Returns
    -------
    list of tuple
        One tuple of recipient addresses (from get_email_tuple) per job
        that had at least one recipient.
    
    Notes
    -----
    Every '.txt' file already in emails_dir is deleted first. After the
    loop, emails_dir is opened in Windows Explorer.
    """
    
    # Start from a clean folder: remove the drafts left over from earlier runs
    for file_name in os.listdir(emails_dir):
        if file_name.endswith('.txt'):
            file_path = os.path.join(emails_dir, file_name)
            os.remove(file_path)
    email_tuples_list = []
    
    # Best-fitting jobs are processed first
    for row_index, row_series in hunting_df[match_series].sort_values('percent_fit', ascending=False).iterrows():
        req_str = row_series['Job Requisition']
        percent_fit = get_percent_fit(row_series)
        sents_list = ['I’m submitting my resume for {}.'.format(req_str),
                      'I’ve reviewed the basic qualifications and believe I’m a good fit for this project.',
                      'Below is a breakdown of the requirements and the amount of experience I have with each.',
                      'I’m available at your convenience to discuss my qualifications and look forward to hearing from you.']
        blurb_str = ' '.join(sents_list)
        job_description = row_series['Job Description']
        quals_list = get_quals_list(job_description)
        
        # One bullet-and-tab prefixed line per qualification
        quals_str = '\n•\t' + '\n•\t'.join(quals_list)
        file_path = os.path.join(emails_dir, '{}_email.txt'.format(row_series['Job Requisition ID'].strip()))
        email_tuple = get_email_tuple(row_series)
        
        # Skip the job entirely when get_email_tuple returned no recipients
        if len(email_tuple):
            email_tuples_list.append(email_tuple)
            
            # Don't overwrite a draft already written for the same requisition ID
            if not os.path.isfile(file_path):
                with open(file_path, 'w', encoding=s.encoding_type) as io_wrapper:
                    print('', file=io_wrapper)
                    print('To: {}'.format('; '.join(list(email_tuple))), file=io_wrapper)
                    print('CC: Safi, Claudia [USA] <safi_claudia@bah.com>; Borrelli, Bill [USA] <Borrelli_Bill@bah.com>', file=io_wrapper)
                    print(subject_str.format(percent_fit, req_str), file=io_wrapper)
                    print('', file=io_wrapper)
                    
                    # Greet the hiring manager by the first space-separated word of the field
                    print('Dear {},'.format(row_series['Hiring Manager'].split(' ')[0]), file=io_wrapper)
                    print('', file=io_wrapper)
                    print('{}'.format(blurb_str), file=io_wrapper)
                    print('', file=io_wrapper)
                    print('Basic Qualifications:{}'.format(quals_str), file=io_wrapper)
                    print('', file=io_wrapper)
                    print(concerns_str, file=io_wrapper)
                    print('', file=io_wrapper)
                    
                    # Attachment name: the requisition string with runs of spaces and
                    # \ / : * ? " > < | replaced by underscores
                    print('Attached: Dave_Babbitt_Resume_for_{}.pdf'.format('_'.join(re.split(r'[ \\\/:\*\?"><\|]+', req_str, 0))),
                          file=io_wrapper)
                
                # IPython shell escape: open the new draft in the text editor
                !"{text_editor_path}" "{os.path.abspath(file_path)}"
    
    # IPython shell escape: open the emails folder in Windows Explorer
    !start %windir%\explorer.exe "{os.path.abspath(emails_dir)}"
    
    return email_tuples_list

In [15]:

# Add new ORR to the hunting dataframe
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
jd_cn = 'Job Description'
reqid_cn = 'Job Requisition ID'
dupe_columns_list = [reqid_cn]
import_columns_list = ['Job Posting', 'Job Requisition', reqid_cn, 'Job Requisition Status', 'Furthest Stage',
                       'Supervisory Organization', 'Group', 'Account Group', 'IMT', 'Cluster', 'Primary Recruiter',
                       'Resource Manager', 'Hiring Manager', 'Job Posting Title', 'Job Profile', 'Job Requisition Type',
                       'Management Level', 'Primary Location', 'Primary Location State/Province', 'Primary Location Country',
                       'Required Clearance', 'Clearance Agency', 'Time Type', 'Recruiting Start Date', 'Job Type', 'Job Family',
                       'Business Title', 'Job Family Group', jd_cn, 'Job Profile Skills']
hunting_dir = r'D:\Documents\Administrivia\Job Hunting\csv'
def add_new_orr(file_name, hunting_df):
    """
    Append the requisitions of one ORR CSV export that are not yet in hunting_df.
    
    Parameters
    ----------
    file_name : str
        Name of the CSV file inside hunting_dir.
    hunting_df : pandas.DataFrame
        The accumulated requisitions; must contain the reqid_cn column.
    
    Returns
    -------
    pandas.DataFrame
        A new dataframe holding the old rows plus the new requisitions,
        with cleaned job descriptions and duplicate requisition IDs
        removed. When the file does not exist, a message is printed and
        hunting_df itself is returned unchanged (instead of None, which
        the caller would otherwise assign over its dataframe).
    
    Raises
    ------
    ValueError
        Raised by pandas when the CSV does not have exactly as many
        columns as import_columns_list.
    """
    file_path = os.path.join(hunting_dir, file_name)
    if not os.path.isfile(file_path):
        print('ORR file not found, hunting_df left unchanged: {}'.format(file_path))
        
        return hunting_df
    
    df = pd.read_csv(file_path, header=0, skiprows=0, encoding='iso8859-1')
    
    # The export's own headers are replaced positionally with the expected names
    df.columns = import_columns_list
    req_id_list = hunting_df[reqid_cn].unique().tolist()
    match_series = (df[reqid_cn].isin(req_id_list))
    hunting_df = pd.concat([hunting_df, df[~match_series]]).fillna({'is_opportunity_application_emailed': False})
    hunting_df[jd_cn] = hunting_df[jd_cn].map(un_msword_ify)
    
    # ignore_index=True already renumbers the rows 0..n-1
    hunting_df = hunting_df.drop_duplicates(subset=dupe_columns_list, ignore_index=True)
    
    # Clean up missing job family column: in such rows the description text
    # sits in 'Job Family Group', so the last columns are shifted back into place
    match_series = hunting_df['Job Family Group'].map(lambda x: str(x).strip().startswith('Key Role'))
    match_series = match_series | hunting_df['Job Family Group'].map(lambda x: str(x).strip().startswith('The Challenge'))
    for row_index, row_series in hunting_df[match_series].iterrows():
        job_description = str(row_series['Job Family Group']).strip()
        job_family_group = str(row_series['Business Title']).strip()
        business_title = str(row_series['Job Family']).strip()
        hunting_df.loc[row_index, jd_cn] = job_description
        hunting_df.loc[row_index, 'Job Family Group'] = job_family_group
        hunting_df.loc[row_index, 'Business Title'] = business_title
        hunting_df.loc[row_index, 'Job Family'] = np.nan
        
        # The description changed, so any earlier score no longer applies
        hunting_df.loc[row_index, 'percent_fit'] = np.nan
    
    return hunting_df


---
# Add Next ORR

In [None]:

# Name of the ORR export to import next (inside hunting_dir)
file_name = 'BAH1002 - Open Requisitions Report (ORR) 2020-04-03 09_02 EDT.csv'
print(file_name)
file_path = os.path.join(hunting_dir, file_name)
# IPython shell escape: open the CSV in the text editor for a visual check before importing
!"{text_editor_path}" "{os.path.abspath(file_path)}"

In [None]:

# Reload the saved dataframe and merge in the requisitions from file_name
hunting_df = s.load_object('hunting_df')
hunting_df = add_new_orr(file_name, hunting_df)

In [17]:

# Columns to display: the imported ORR columns followed by the locally-maintained tracking columns
columns_list = ['Job Posting', 'Job Requisition', 'Job Requisition ID', 'Job Requisition Status', 'Furthest Stage',
                'Supervisory Organization', 'Group', 'Account Group', 'IMT', 'Cluster', 'Primary Recruiter', 'Resource Manager',
                'Hiring Manager', 'Job Posting Title', 'Job Profile', 'Job Requisition Type', 'Management Level',
                'Primary Location', 'Primary Location State/Province', 'Primary Location Country', 'Required Clearance',
                'Clearance Agency', 'Time Type', 'Recruiting Start Date', 'Job Type', 'Job Family', 'Business Title',
                'Job Family Group', 'Job Description', 'Job Profile Skills', 'percent_fit',
                'is_opportunity_application_emailed', 'opportunity_application_email_date', 'is_remote_delivery', 'manager_notes', 'CS Notes']
# Show the last five rows, transposed so that each column becomes a display row
hunting_df[columns_list].tail(5).T

Unnamed: 0,5129,5130,5131,5132,5133
Job Posting,Technical Solutions Science and Engineering Ma...,"Technical Writer, Senior",Test and Evaluation Engineer,Trainer,Vulnerability Cybersecurity Engineer
Job Requisition,R0081512 Technical Solutions Science and Engin...,"R0081100 Technical Writer, Senior (Open)",R0081488 Test and Evaluation Engineer (Open),R0081502 Trainer (Open),R0081454 Vulnerability Cybersecurity Engineer ...
Job Requisition ID,R0081512,R0081100,R0081488,R0081502,R0081454
Job Requisition Status,Open,Open,Open,Open,Open
Furthest Stage,Review,Review,Review,Review,Review
Supervisory Organization,Supervisory Organization (Jonathan Levitt (566...,Supervisory Organization (James Agbai (553451)),Supervisory Organization (Derick Wingler (5931...,Supervisory Organization (Edward Greene | Brad...,Supervisory Organization (Kate Knyzewski (5204...
Group,GLOBAL DEFENSE GROUP,CIVILIAN SERVICES GROUP,GLOBAL DEFENSE GROUP,GLOBAL DEFENSE GROUP,STRATEGIC INNOVATION GROUP
Account Group,NMC ACCT GROUP,HEALTH ACCT GROUP,NMC ACCT GROUP,NMC ACCT GROUP,FSO ACCT GROUP
IMT,NMC IMT,HEALTH IMT,NMC IMT,NMC IMT,CYBER & ENGINEERING IMT
Cluster,Indianapolis Cluster,Natl Business Park Cluster,Norfolk Cluster,Norfolk Cluster,Telework Cluster


In [None]:

# Persist the updated dataframe under the name 'hunting_df'
s.store_objects(hunting_df=hunting_df)


---
# Training

In [18]:

# Rebuild the dataframe from the dictionary
rows_list = [{'qualification_str': qualification_str, 'is_fit': is_fit} for qualification_str, is_fit in basic_quals_dict.items()]
basic_quals_df = pd.DataFrame(rows_list)
s.store_objects(basic_quals_df=basic_quals_df)

# Re-transform the bag-of-words and tf-idf from the new manual scores
sents_list = basic_quals_df.qualification_str.tolist()

# Bag-of-words
# These analysis settings must be identical for the training vectorizer and
# the inference vectorizer below; otherwise the n-gram and 'c++'/'c#' style
# columns of the vocabulary can never be produced at inference time.
# (token_pattern is omitted: scikit-learn ignores it when a tokenizer is given.)
bq_cv_kwargs = dict(lowercase=True, tokenizer=regex_tokenizer, ngram_range=(1, 3))
cv = CountVectorizer(**bq_cv_kwargs)
bow_matrix = cv.fit_transform(sents_list)
s.store_objects(bq_cv_vocab=cv.vocabulary_)

# Tf-idf, must get from BOW first
tt = TfidfTransformer()
tfidf_matrix = tt.fit_transform(bow_matrix)
s.store_objects(bq_tt=tt)

# Re-train the classifier
X = tfidf_matrix.toarray()
y = basic_quals_df.is_fit.to_numpy()
fit_estimators_dict = s.load_object('fit_estimators_dict')
basic_quals_clf = fit_estimators_dict['LogisticRegression']
basic_quals_clf.fit(X, y)
s.store_objects(basic_quals_clf=basic_quals_clf)

# Re-calibrate the inference engine
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab, **bq_cv_kwargs)
bq_cv._validate_vocabulary()
bq_tt = s.load_object('bq_tt')
def predict_percent_fit(quals_list):
    """
    Run the retrained classifier on a list of qualification strings.
    
    Parameters
    ----------
    quals_list : list of str
        The qualification texts to score.
    
    Returns
    -------
    numpy.ndarray
        The output of basic_quals_clf.predict_proba for the vectorized
        texts (one row per qualification), or an empty array when
        quals_list is empty.
    """
    y_predict_proba = np.array([])
    if len(quals_list):
        X_test = bq_tt.transform(bq_cv.transform(quals_list)).toarray()
        y_predict_proba = basic_quals_clf.predict_proba(X_test)
    
    return y_predict_proba
print('Retraining complete')

Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\basic_quals_df.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\bq_cv_vocab.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\bq_tt.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\basic_quals_clf.pickle
Retraining complete



---
# Inference

In [19]:

# Loop through all the unset %fit values, set them if you can, break for help if you can't
match_series = (hunting_df.percent_fit >= 0.0)
is_changed = False
try:
    for row_index, row_series in hunting_df[~match_series].iterrows():
        quals_list, job_fitness = print_fit_job(row_index, row_series, basic_quals_dict)
        
        # Nothing to score when no qualifications could be extracted
        if not len(quals_list):
            continue
        
        # A promising job with qualifications that were never scored by hand:
        # stop here so they can be labelled manually (quals_list stays available)
        if (job_fitness > 0.8) and not all(qual_str in basic_quals_dict for qual_str in quals_list):
            break
        
        # qual_sum returns each score as a string; add them numerically
        # rather than eval-ing a string-built expression
        score_total = sum(float(qual_sum(qual_str)) for qual_str in quals_list)
        hunting_df.loc[row_index, 'percent_fit'] = score_total / len(quals_list)
        is_changed = True
finally:
    
    # Save once, even when interrupted, instead of re-pickling the whole dataframe per row
    if is_changed:
        s.store_objects(hunting_df=hunting_df)

# Recompute the mask so the progress report includes the rows scored just now
match_series = (hunting_df.percent_fit >= 0.0)
print('{}/{} = {}% completed'.format(hunting_df[match_series].shape[0], hunting_df.shape[0],
                                     int(100 * hunting_df[match_series].shape[0] / hunting_df.shape[0])))

5134/5134 = 100% completed


In [None]:

# Manually label the unscored qual
# quals_list is whatever the scoring loop above last assigned; edit the index
# to pick the qualification to label, and the value below to set its score
qualification_str = quals_list[3]
print(qualification_str)
basic_quals_dict[qualification_str] = 1
s.store_objects(basic_quals_dict=basic_quals_dict)


---
# Create the emails
Can you work in automation to ensure you are not emailing the same employee more than once during a two-week span?

In [20]:

def f(x):
    """Return True when 'python' occurs, in any letter case, in str(x)."""
    
    return 'python' in str(x).lower()

def g(x):
    """Return True when 'polygraph' occurs, in any letter case, in str(x)."""
    
    return 'polygraph' in str(x).lower()

ds_regex = re.compile(r'data *scien')
ml_regex = re.compile(r'machine *learning|\bML\b')
def ff(x):
    """
    Return True when str(x) mentions data science, machine learning or Python.
    
    'data scien...', 'machine learning' and 'python' are matched in any
    letter case; the acronym 'ML' is matched only as an upper-case
    whole word.
    
    Parameters
    ----------
    x : object
        The value to inspect; it is converted with str() first.
    
    Returns
    -------
    bool
    """
    raw_text = str(x)
    lowered_text = raw_text.lower()
    
    # The upper-case acronym branch of ml_regex can only match text that has
    # not been lower-cased, so ml_regex is applied to both forms
    return bool(
        ds_regex.search(lowered_text)
        or ml_regex.search(lowered_text)
        or ml_regex.search(raw_text)
        or ('python' in lowered_text)
    )

# Build the mask of requisitions worth emailing about, one named criterion at a time
is_sold = hunting_df['Job Requisition Type'].isin(['Sold and Funded', 'Sold & Unfunded'])
is_relevant = hunting_df['Job Description'].map(ff)
is_good_fit = (hunting_df.percent_fit >= 0.85)
is_already_emailed = hunting_df['is_opportunity_application_emailed']

# Only an explicit False rules a job out; unset values are kept
is_not_remote = (hunting_df.is_remote_delivery == False)
needs_high_clearance = hunting_df['Required Clearance'].isin(['TS/SCI', 'TS/SCI w/CIP', 'TS/SCI w/FSP'])
needs_polygraph = hunting_df['Job Description'].map(g)

match_series = is_sold & is_relevant
match_series = match_series & is_good_fit & ~is_already_emailed
match_series = match_series & ~is_not_remote
match_series = match_series & ~needs_high_clearance
match_series = match_series & ~needs_polygraph

# Report how many jobs were selected, then the largest per-column count for each clearance level
selected_df = hunting_df[match_series]
print(selected_df.shape)
print(selected_df.groupby('Required Clearance').count().T.max().sort_values(ascending=False))

(0, 36)
Series([], dtype: float64)


In [None]:

# Preview the first five selected jobs, transposed so that each column becomes a display row
hunting_df[match_series].head(5).T

In [21]:

# Write and open the draft emails for the selected jobs; keep the recipient tuples for reference
email_tuples_list = print_emails(match_series)

In [None]:

from datetime import datetime

# Manually note the email has been sent
# Edit req_id to the requisition whose email was just sent
req_id = 'R0077973'.strip()
match_series = (hunting_df['Job Requisition ID'] == req_id)
hunting_df.loc[match_series, 'is_opportunity_application_emailed'] = True
date_obj = datetime.now()
hunting_df.loc[match_series, 'opportunity_application_email_date'] = pd.Timestamp(date_obj)
s.store_objects(hunting_df=hunting_df)

# Record the send time for each recipient. get_email_tuple returns only the
# contacts that passed its recently-emailed check, so only those are stamped.
last_emailed_dict = s.load_object('last_emailed_dict')
# Raises IndexError when no row has this requisition ID
row_series = hunting_df.loc[match_series].iloc[0]
email_tuple = get_email_tuple(row_series)
for email in email_tuple:
    last_emailed_dict[email] = pd.Timestamp(date_obj)
s.store_objects(last_emailed_dict=last_emailed_dict)


---
# Flag setting

In [None]:

# Set notes for this one job
# Edit req_id and new_note before running
req_id = 'R0073507'
new_note = 'The position is put on hold right now.'
match_series = (hunting_df['Job Requisition ID'] == req_id)
manager_notes = hunting_df.loc[match_series, 'manager_notes'].tolist()[0]

# A job that has never been annotated holds NaN (a float) rather than a
# string; treat it as empty instead of failing on len()
if not isinstance(manager_notes, str):
    manager_notes = ''

# Separate the new note from any existing notes with a space
if len(manager_notes):
    manager_notes += ' '
manager_notes += new_note
hunting_df.loc[match_series, 'manager_notes'] = manager_notes
s.store_objects(hunting_df=hunting_df)

In [None]:

# Set remote delivery for this one job
# Edit req_id before running; an explicit False excludes the job from the email selection mask
req_id = 'R0073507'
match_series = (hunting_df['Job Requisition ID'] == req_id)
hunting_df.loc[match_series, 'is_remote_delivery'] = False
s.store_objects(hunting_df=hunting_df)