
---
# Load needed libraries and functions

In [1]:

%run ../../load_magic/storage.py
%run ../../load_magic/paths.py
%run ../../load_magic/lists.py
%run ../../load_magic/environment.py
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
%pprint

notebook_path = get_notebook_path()
print(notebook_path)

s = Storage()
print(['s.{}'.format(fn) for fn in dir(s) if not fn.startswith('_')])
hunting_df = s.load_object('hunting_df')
basic_quals_df = s.load_object('basic_quals_df')
basic_quals_dict = s.load_object('basic_quals_dict')
basic_quals_clf = s.load_object('basic_quals_clf')
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_tt = s.load_object('bq_tt')
dir()

Pretty printing has been turned OFF
D:\Documents\Repositories\notebooks\Miscellaneous\ipynb\Job Hunting.ipynb
['s.attempt_to_pickle', 's.data_csv_folder', 's.data_folder', 's.encoding_type', 's.load_csv', 's.load_dataframes', 's.load_object', 's.save_dataframes', 's.saves_csv_folder', 's.saves_folder', 's.saves_pickle_folder', 's.store_objects']


['AdaBoostClassifier', 'Config', 'CountVectorizer', 'In', 'Out', 'RandomForestClassifier', 'SequenceMatcher', 'Storage', 'TfidfTransformer', '_', '__', '___', '__builtin__', '__builtins__', '__doc__', '__loader__', '__name__', '__nonzero__', '__package__', '__spec__', '_dh', '_i', '_i1', '_ih', '_ii', '_iii', '_oh', 'basic_quals_clf', 'basic_quals_df', 'basic_quals_dict', 'bq_cv_vocab', 'bq_tt', 'check_4_doubles', 'check_for_typos', 'conjunctify_list', 'copyfile', 'csv', 'encoding', 'exit', 'get_classifier', 'get_data_structs_df', 'get_datastructure_prediction', 'get_dir_tree', 'get_git_lfs_track_commands', 'get_importances', 'get_input_sample', 'get_ipython', 'get_module_version', 'get_notebook_path', 'get_specific_gitignore_files', 'get_struct_name', 'humanize_bytes', 'hunting_df', 'ipykernel', 'json', 'jupyter_config_dir', 'notebook_path', 'notebookapp', 'np', 'os', 'pd', 'pickle', 'preprocess_data', 'print_all_files_ending_starting_with', 'print_all_files_ending_with', 'print_all_f


---
# Needed extra functions

In [2]:

# Email prep
subject_str = '{}% fit: Internal Candidate, Dave Babbitt, for {}'
concerns_str = 'One important question I have is if the work can be supported remotely or if this position is available for remote delivery '
concerns_str += '(or something equivalent).'
concerns_str += " I don't want to move my family out of New England."
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
emails_dir = os.path.join(s.saves_folder, 'emails')
os.makedirs(name=emails_dir, exist_ok=True)
name_regex = re.compile(r'^([^(]+) \(\d+\)"')
def clean_email(email_str):
    match_obj = name_regex.search(email_str)
    if match_obj:
        email_str = match_obj.group(1)
    names_list = re.split(r'\s+', email_str, 0)
    if len(names_list) >= 2:
        first_name = names_list[0]
        last_name = names_list[1]
        email_str = '{}, {} [USA] <{}_{}@bah.com>'.format(last_name, first_name, last_name.lower(), first_name.lower())
    
    return email_str

In [3]:

def get_percent_fit(row_series):
    percent_fit = row_series['percent_fit']
    if str(percent_fit) == 'nan':
        percent_fit = 0
    percent_fit = int(percent_fit*100)
    
    return percent_fit

In [4]:

def print_loc_computation(row_index, quals_list, basic_quals_dict):
    print()
    numerator_str_list = []
    for qual_str in quals_list:
        if qual_str in basic_quals_dict:
            numerator_str_list.append(str(basic_quals_dict[qual_str]))
        else:
            numerator_str_list.append('000')
    numerator_str = '+'.join(numerator_str_list)
    print("hunting_df.loc[{}, 'percent_fit'] = ({})/{}".format(row_index, numerator_str, len(quals_list)))

In [5]:

def get_predictions_and_counts(prediction_list, quals_list):
    qual_count = 0
    prediction_str = ''
    for pred_array, qual_str in zip(prediction_list, quals_list):
        prediction = pred_array[1]
        prediction_str += '\n{} {}'.format(prediction, qual_str)
        if prediction > 0.5:
            qual_count += 1
    
    return prediction_str, qual_count

In [6]:

def get_quals_str(prediction_list, quals_list, basic_quals_dict):
    qual_count = 0
    quals_str = ''
    for pred_array, (i, qual_str) in zip(prediction_list, enumerate(quals_list)):
        if qual_str in basic_quals_dict:
            formatted_str = '\nquals_list[{}] = "{}" ({})'
        else:
            formatted_str = '\n*quals_list[{}] = "{}" ({})'
        prediction = pred_array[1]
        quals_str += formatted_str.format(i, qual_str, prediction)
        if prediction > 0.5:
            qual_count += 1
    
    return quals_str, qual_count

In [7]:

def print_fit_job(row_index, row_series, basic_quals_dict):
    job_fitness = 0.0
    job_description = row_series['Job Description']
    quals_list = get_quals_list(job_description)
    if len(quals_list):
        prediction_list = list(predict_percent_fit(quals_list))
        #prediction_str, qual_count = get_predictions_and_counts(prediction_list, quals_list)
        quals_str, qual_count = get_quals_str(prediction_list, quals_list, basic_quals_dict)
        job_fitness = qual_count/len(prediction_list)
        if job_fitness > 0.8:
            print('Basic Qualifications:{}'.format(quals_str))
            #print(prediction_str)
            print(job_fitness)
            print_loc_computation(row_index, quals_list, basic_quals_dict)
    
    return quals_list, job_fitness

In [8]:

def qual_sum(qual_str):
    results = '"{}"'.format(qual_str)
    if qual_str in basic_quals_dict:
        results = basic_quals_dict[qual_str]
    else:
        results = predict_percent_fit([qual_str])[0][1]
        if results > 0.5:
            results = 1.0
        else:
            results = 0.0
    
    return str(results)

In [9]:

def print_job_description(req_id):
    match_series = (hunting_df['Job Requisition ID'] == req_id)
    job_description = hunting_df[match_series]['Job Description'].tolist()[0]
    print(get_quals_list(job_description))
    print(job_description)

In [10]:

scanner_regex = re.compile(r'\b[1-9a-zA-Z][0-9a-zA-Z]*( *[#\+]{1,2}|\b)')
def regex_tokenizer(corpus):
    
    return [match.group() for match in re.finditer(scanner_regex, corpus)]

In [11]:

a_list = ['Additional Qualifications?', 'Nice If You Have', 'Nice if you have', 'Nice if You Have',
          'Additional Preferred Qualifications', 'Nice if you Have', 'Additional qualifications']
a_str = '({}):?'.format('|'.join(a_list))
def get_quals_list(job_description):
    job_description = re.sub('Â', '', job_description)
    basic_quals = ''
    quals_list = []
    items_list = re.split('(Key Role|The Challenge):', job_description, 0)
    if len(items_list) > 1:
        job_description = items_list[-1].strip()
    items_list = re.split('[\r\n]+(Basic Qualifications?|You Have|You have):?', job_description, 0)
    if len(items_list) > 1:
        job_description = items_list[-1].strip()
    items_list = re.split(a_str, job_description, 0)
    if len(items_list) > 1:
        basic_quals = items_list[0].strip()
    else:
        items_list = re.split('(Clearance|Build Your Career):', job_description, 0)
        basic_quals = items_list[0].strip()
    if basic_quals != '':
        quals_list = [re.sub(r'â¯', ' ', q) for q in re.split('[\r\n]+', basic_quals, 0)]
        quals_list = [re.sub(r'^[?â-]+', '', x).strip() for x in quals_list]
        quals_list = [re.sub(r'[â-]+$', '', x).strip() for x in quals_list]
        quals_list = [re.sub(r'â', '-', x).strip() for x in quals_list]
        quals_list = [re.sub(r'â', '`', x).strip() for x in quals_list]
        quals_list = [re.sub(u'\\xa0', u' ', x).strip() for x in quals_list]
        quals_list = [re.sub(r'\s+$', '', x) for x in quals_list]
        quals_list = [x for x in quals_list if x != '']
    
    return quals_list

In [12]:

def print_emails(match_series):
    for file_name in os.listdir(emails_dir):
        if file_name.endswith('.txt'):
            file_path = os.path.join(emails_dir, file_name)
            os.remove(file_path)
    for row_index, row_series in hunting_df[match_series].sort_values('percent_fit', ascending=False).iterrows():
        req_str = row_series['Job Requisition']
        percent_fit = get_percent_fit(row_series)
        sents_list = ['I’m submitting my resume for {}.'.format(req_str),
                      'I’ve reviewed the basic qualifications and believe I’m a good fit for this project.',
                      'Below is a breakdown of the requirements and the amount of experience I have with each.',
                      'I’m available at your convenience to discuss my qualifications and look forward to hearing from you.']
        blurb_str = ' '.join(sents_list)
        hm_str = row_series['Hiring Manager']
        pr_str = row_series['Primary Recruiter']
        job_description = row_series['Job Description']
        quals_list = get_quals_list(job_description)
        quals_str = '\n•\t' + '\n•\t'.join(quals_list)
        file_path = os.path.join(emails_dir, '{}_email.txt'.format(row_series['Job Requisition ID'].strip()))
        if not os.path.isfile(file_path):
            with open(file_path, 'w', encoding=s.encoding_type) as io_wrapper:
                print('', file=io_wrapper)
                print('To: {}; {}'.format(clean_email(hm_str), clean_email(pr_str)), file=io_wrapper)
                print('CC: Safi, Claudia [USA] <safi_claudia@bah.com>; Borrelli, Bill [USA] <Borrelli_Bill@bah.com>', file=io_wrapper)
                print(subject_str.format(percent_fit, req_str), file=io_wrapper)
                print('', file=io_wrapper)
                print('Dear {},'.format(hm_str.split(' ')[0]), file=io_wrapper)
                print('', file=io_wrapper)
                print('{}'.format(blurb_str), file=io_wrapper)
                print('', file=io_wrapper)
                print('Basic Qualifications:{}'.format(quals_str), file=io_wrapper)
                print('', file=io_wrapper)
                print(concerns_str, file=io_wrapper)
                print('', file=io_wrapper)
                print('Attached: Dave_Babbitt_Resume_for_{}.pdf'.format('_'.join(re.split(r'[ \\\/:\*\?"><\|]+', req_str, 0))),
                      file=io_wrapper)
            !"{text_editor_path}" "{os.path.abspath(file_path)}"
    !start %windir%\explorer.exe "{os.path.abspath(emails_dir)}"


---

In [122]:

# Rebuild the datframe from the dictionary
rows_list = [{'qualification_str': qualification_str, 'is_fit': is_fit} for qualification_str, is_fit in basic_quals_dict.items()]
basic_quals_df = pd.DataFrame(rows_list)
s.store_objects(basic_quals_df=basic_quals_df)

# Re-transform the bag-of-words and tf-idf from the new manual scores
sents_list = basic_quals_df.qualification_str.tolist()

# Bag-of-words
cv = CountVectorizer(lowercase=True, tokenizer=regex_tokenizer, token_pattern=r'\b[1-9a-zA-Z][0-9a-zA-Z]*[#\+]{0,2}', ngram_range=(1, 2))
bow_matrix = cv.fit_transform(sents_list)
s.store_objects(bq_cv_vocab=cv.vocabulary_)

# Tf-idf, must get from BOW first
tt = TfidfTransformer()
tfidf_matrix = tt.fit_transform(bow_matrix)
s.store_objects(bq_tt=tt)

# Re-train the classifier
X = tfidf_matrix.toarray()
y = basic_quals_df.is_fit.to_numpy()
basic_quals_clf = RandomForestClassifier(n_estimators=997)
#basic_quals_clf = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=RandomForestClassifier(n_estimators=997),
#                                     learning_rate=1.0, n_estimators=50, random_state=None)
basic_quals_clf.fit(X, y)
s.store_objects(basic_quals_clf=basic_quals_clf)

# Re-calibrate the inference engine
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab)
bq_cv._validate_vocabulary()
bq_tt = s.load_object('bq_tt')
def predict_percent_fit(quals_list):
    y_predict_proba = np.array([])
    if len(quals_list):
        X_test = bq_tt.transform(bq_cv.transform(quals_list)).toarray()
        y_predict_proba = basic_quals_clf.predict_proba(X_test)
    
    return y_predict_proba

Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\basic_quals_df.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\bq_cv_vocab.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\bq_tt.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\basic_quals_clf.pickle



---
# Look for a fit greater than 80%

In [123]:

# Loop through all the unset %fit values, set them if you can, break for help if you can't
match_series = (hunting_df.percent_fit >= 0.0)
for row_index, row_series in hunting_df[~match_series].iterrows():
    quals_list, job_fitness = print_fit_job(row_index, row_series, basic_quals_dict)
    if job_fitness > 0.8:
        if all(qual_str in basic_quals_dict for qual_str in quals_list):
            hunting_df.loc[row_index, 'percent_fit'] = eval(' + '.join(map(qual_sum, quals_list))) / len(quals_list)
            s.store_objects(hunting_df=hunting_df)
        else:
            break
    else:
        if len(quals_list):
            hunting_df.loc[row_index, 'percent_fit'] = eval(' + '.join(map(qual_sum, quals_list))) / len(quals_list)
            s.store_objects(hunting_df=hunting_df)
print('{}/{} = {}% completed'.format(hunting_df[match_series].shape[0], hunting_df.shape[0],
                                     int(100 * hunting_df[match_series].shape[0] / hunting_df.shape[0])))

Basic Qualifications:
quals_list[0] = "5+ years of experience with IT" (0.6639919759277834)
quals_list[1] = "Experience with building Python assessment features" (0.6288866599799399)
quals_list[2] = "Experience with adding new Jenkins and Maven plugins" (0.31795386158475425)
quals_list[3] = "Ability to collaborate with others effectively" (0.6619859578736209)
quals_list[4] = "Ability to learn new concepts and technologies quickly" (0.720160481444333)
quals_list[5] = "Ability to travel up to 70% of the time" (0.7151454363089268)
quals_list[6] = "HS diploma or GED" (0.831494483450351)
0.8571428571428571

hunting_df.loc[185, 'percent_fit'] = (1+0+0+1+1+1+1)/7
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\hunting_df.pickle
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\hunting_df.pickle
Basic Qualifications:
*quals_list[0] = "2+ years of experience with Azure, AWS, or Google Cloud platforms" (0.6529588766298897)
*quals_list[1] = "2+ 

In [127]:

# Manually label the unscored qual
qualification_str = quals_list[3]
print(qualification_str)
basic_quals_dict[qualification_str] = 1
s.store_objects(basic_quals_dict=basic_quals_dict)

Ability to script in a language such as Bash or Python
Pickling to D:\Documents\Repositories\notebooks\Miscellaneous\saves\pickle\basic_quals_dict.pickle



---
# Manually score unscored jobs

In [None]:

hunting_df.loc[436, 'percent_fit'] = (0+1+1+1+1+1)/6
s.store_objects(hunting_df=hunting_df)


---
# Rescore the quals dataframe

In [None]:

from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier

X = tfidf_matrix.toarray()
y = basic_quals_df.is_fit.to_numpy()
estimators_list = [AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0, n_estimators=50, random_state=None),
                   BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False, max_features=1.0, max_samples=1.0,
                                     n_estimators=10, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False),
                   ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None,
                                        max_features='auto', max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0,
                                        min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None, oob_score=False, random_state=None, verbose=0, warm_start=False),
                   GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None, learning_rate=0.1, loss='deviance', max_depth=3,
                                              max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                              min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100,
                                              n_iter_no_change=None, presort='deprecated', random_state=None, subsample=1.0, tol=0.0001,
                                              validation_fraction=0.1, verbose=0, warm_start=False),
                   RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None, criterion='gini', max_depth=None, max_features='auto',
                                          max_leaf_nodes=None, max_samples=None, min_impurity_decrease=0.0, min_impurity_split=None,
                                          min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
                                          oob_score=False, random_state=None, verbose=0, warm_start=False)]
estimators_list = [clf.fit(X, y) for clf in estimators_list]

In [None]:

for clf in estimators_list:
    clf_name = str(type(clf)).split('.')[-1].split("'")[0]
    basic_quals_df[clf_name] = np.nan
    for row_index, row_series in basic_quals_df.iterrows():
        qualification_str = row_series.qualification_str
        X_test = bq_tt.transform(bq_cv.transform([qualification_str])).toarray()
        y_predict_proba = clf.predict_proba(X_test)[0][1]
        basic_quals_df.loc[row_index, clf_name] = y_predict_proba
    #break

In [None]:

clf = sklearn.ensemble.StackingClassifier(estimators=[(str(type(e)).split('.')[-1].split("'")[0], e) for e in estimators_list],
                                          final_estimator=None, cv=None, stack_method='auto', n_jobs=None, passthrough=False, verbose=0)
clf_name = str(type(clf)).split('.')[-1].split("'")[0]
basic_quals_df[clf_name] = np.nan
clf.fit(X, y)

# Re-score the quals dataframe
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab)
bq_cv._validate_vocabulary()
bq_tt = s.load_object('bq_tt')
for row_index, row_series in basic_quals_df.iterrows():
    qualification_str = row_series.qualification_str
    X_test = bq_tt.transform(bq_cv.transform([qualification_str])).toarray()
    y_predict_proba = clf.predict_proba(X_test)[0][1]
    basic_quals_df.loc[row_index, clf_name] = y_predict_proba

In [None]:

clf = sklearn.ensemble.VotingClassifier(estimators=[(str(type(e)).split('.')[-1].split("'")[0], e) for e in estimators_list],
                                        voting='soft', weights=None, n_jobs=None, flatten_transform=True)
clf_name = str(type(clf)).split('.')[-1].split("'")[0]
basic_quals_df[clf_name] = np.nan
clf.fit(X, y)

In [None]:

# Re-score the quals dataframe
bq_cv_vocab = s.load_object('bq_cv_vocab')
bq_cv = CountVectorizer(vocabulary=bq_cv_vocab)
bq_cv._validate_vocabulary()
bq_tt = s.load_object('bq_tt')
for row_index, row_series in basic_quals_df.iterrows():
    qualification_str = row_series.qualification_str
    X_test = bq_tt.transform(bq_cv.transform([qualification_str])).toarray()
    y_predict_proba = clf.predict_proba(X_test)[0][1]
    basic_quals_df.loc[row_index, clf_name] = y_predict_proba

In [None]:

basic_quals_df.sample(5).T

In [None]:

columns_list = ['qualification_str', 'is_fit', 'AdaBoostClassifier', 'BaggingClassifier', 'ExtraTreesClassifier',
                'GradientBoostingClassifier', 'RandomForestClassifier', 'StackingClassifier', 'VotingClassifier']
basic_quals_df = basic_quals_df[columns_list]
s.store_objects(basic_quals_df=basic_quals_df)

In [None]:

from scipy.stats import entropy

clf_name_list = [clf for clf in basic_quals_df.columns if clf.endswith('Classifier')]
pk_list = basic_quals_df.is_fit.tolist()
columns_list = ['clf_name', 'boundary_diff', 'clf_entropy', 'relative_entropy']
rows_list = []
for column_name in clf_name_list:
    match_series = (basic_quals_df.is_fit == 1)
    upper_bound = basic_quals_df[match_series][column_name].min()
    match_series = (basic_quals_df.is_fit == 0)
    lower_bound = basic_quals_df[match_series][column_name].max()
    qk_list = basic_quals_df[column_name].tolist()
    row_dict = {}
    row_dict['clf_name'] = column_name
    row_dict['boundary_diff'] = upper_bound-lower_bound
    row_dict['clf_entropy'] = entropy(pk=qk_list)
    row_dict['relative_entropy'] = entropy(pk=pk_list, qk=qk_list)
    rows_list.append(row_dict)
entropy_df = pd.DataFrame(rows_list, columns=columns_list)
entropy_df.sort_values('clf_entropy', ascending=False)

In [None]:

match_series = (basic_quals_df.is_fit == 1)
basic_quals_df[match_series]['predict_proba'].min()
#basic_quals_df.columns.tolist()

In [None]:

match_series = (basic_quals_df.is_fit == 0)
basic_quals_df[match_series]['predict_proba'].max()
#basic_quals_df.columns.tolist()


---
# Create the emails

In [None]:

def f(x):
    
    return 'python' in str(x).lower()

match_series = hunting_df['Job Requisition Type'].isin(['Sold and Funded', 'Sold & Unfunded']) & hunting_df['Job Description'].map(f)
match_series = match_series & (hunting_df.percent_fit >= 0.85) & ~hunting_df['is_opportunity_application_emailed']
match_series = match_series & ~(hunting_df.is_remote_delivery == False)
match_series = match_series & ~hunting_df['Required Clearance'].isin(['TS/SCI', 'TS/SCI w/CIP'])
print(hunting_df[match_series].shape)
print(hunting_df[match_series].groupby('Required Clearance').count().T.max().sort_values(ascending=False))
hunting_df[match_series].head(5).T

In [None]:

print_emails(match_series)


---
# Flag setting

In [None]:

# Manually note the email has been sent
req_id = 'R0062296'.strip()
match_series = (hunting_df['Job Requisition ID'] == req_id)
hunting_df.loc[match_series, 'is_opportunity_application_emailed'] = True
s.store_objects(hunting_df=hunting_df)

In [None]:

# Set remote delivery for this list of jobs
req_id_list = ['R0073564', 'R0073583', 'R0073584', 'R0073585', 'R0073586']
match_series = hunting_df['Job Requisition ID'].isin(req_id_list)
hunting_df.loc[match_series, 'is_remote_delivery'] == True
s.store_objects(hunting_df=hunting_df)

In [None]:

# Set remote delivery for this one job
req_id = 'R0069681'
match_series = (hunting_df['Job Requisition ID'] == req_id)
hunting_df.loc[match_series, 'is_remote_delivery'] == False
s.store_objects(hunting_df=hunting_df)

In [None]:

# Set university recruiting for this job
req_id = 'R0066388'
match_series = (hunting_df['Job Requisition ID'] == req_id)
hunting_df.loc[match_series, 'is_for_university_recruiting'] = 1
s.store_objects(hunting_df=hunting_df)

In [None]:

# Manually note you can't do this job from home
req_id = 'R0064764'
match_series = (hunting_df['Job Requisition ID'] == req_id)
hunting_df.loc[match_series, 'is_remote_delivery'] = False
s.store_objects(hunting_df=hunting_df)

In [None]:

# Manually note you can't do this job from home
hunting_df.loc[83, 'is_remote_delivery'] = False
s.store_objects(hunting_df=hunting_df)

In [None]:

match_series = (hunting_df.index == 2616)
hunting_df.loc[match_series, 'is_for_university_recruiting'] = 1
hunting_df.loc[match_series, 'percent_fit'] = 0.0
s.store_objects(hunting_df=hunting_df)


---
# Data Exploration

In [None]:

basic_quals_dict['Ability to operate independently and manage staff'] = 0
s.store_objects(basic_quals_dict=basic_quals_dict)

In [None]:

req_id = 'R0073507'
match_series = (hunting_df['Job Requisition ID'] == req_id)
print(hunting_df[match_series]['percent_fit'].tolist())
for row_index, row_series in hunting_df[match_series].iterrows():
    quals_list, job_fitness = print_fit_job(row_index, row_series, basic_quals_dict)

In [None]:

hunting_df.loc[504, 'percent_fit'] = (1+1+1+1+0+1+1)/7
s.store_objects(hunting_df=hunting_df)

In [47]:

req_id_list = ['R0073564', 'R0073583', 'R0073584', 'R0073585', 'R0073586']
match_series = hunting_df['Job Requisition ID'].isin(req_id_list)
hunting_df[match_series].T

Unnamed: 0,426,427,428,475,1390
Hiring Manager,John Brandom (568067),John Brandom (568067),John Brandom (568067),John Brandom (568067),John Brandom (568067)
Management Level,Senior Consultant,Senior Consultant,Senior Consultant,Senior Consultant,Senior Consultant
IMT,JCC IMT,JCC IMT,JCC IMT,JCC IMT,JCC IMT
Job Requisition,R0073583 Data Engineer (Open),R0073584 Data Engineer (Open),R0073564 Data Engineer (Open),R0073585 Data Scientist (Open),R0073586 Software Developer (Open)
Job Requisition Type,Sold and Funded,Sold and Funded,Sold and Funded,Sold and Funded,Sold and Funded
Cluster,Fayetteville Cluster,Fayetteville Cluster,Fayetteville Cluster,Fayetteville Cluster,Fayetteville Cluster
Time Type,Full time,Full time,Full time,Full time,Full time
Job Posting Title,Data Engineer,Data Engineer,Data Engineer,Data Scientist,Software Developer
Safi Recommendation,0,0,0,0,0
Recruiting Start Date,11/20/2019,11/20/2019,11/20/2019,11/20/2019,11/20/2019


In [None]:

hunting_df['Job Description'] = hunting_df['Job Description'].map(lambda x: re.sub(r'', "'", x))
hunting_df['Job Description'] = hunting_df['Job Description'].map(lambda x: re.sub(r'', '—', x))
s.store_objects(hunting_df=hunting_df)
req_id = 'R0073585'
print_job_description(req_id)

In [None]:

match_series = (hunting_df.index == 437)
print(hunting_df[match_series]['Job Description'].tolist()[0])

In [None]:

match_series = (hunting_df.percent_fit >= 0.0)
print(hunting_df[~match_series].sample(1)['Job Description'].tolist()[0])

In [None]:

#print(['hunting_df.{}'.format(fn) for fn in dir(hunting_df) if 'dup' in fn.lower()])
match_series = hunting_df.duplicated(subset='Job Requisition ID', keep=False)
print(hunting_df[match_series].shape)

In [None]:

columns_list = ['Hiring Manager', 'Management Level', 'IMT', 'Job Requisition', 'Job Requisition Type', 'Cluster', 'Time Type',
                'Job Posting Title', 'Recruiting Start Date', 'Account Group', 'Job Requisition ID', 'Job Type',
                'Supervisory Organization', 'Clearance Agency', 'Primary Location State/Province', 'Furthest Stage',
                'Resource Manager', 'Primary Location', 'Job Description', 'Group', 'Job Profile', 'Job Family Group', 'FSO',
                'Job Family', 'Job Requisition Status', 'Business Title', 'Job Posting', 'Primary Location Country',
                'Required Clearance', 'Primary Recruiter']
hunting_df = hunting_df.drop_duplicates(subset=columns_list, ignore_index=True)
s.store_objects(hunting_df=hunting_df)

In [None]:

idx_list = hunting_df[match_series].index.tolist()
first = idx_list[0]
second = idx_list[1]
columns_list = []
for column_name in hunting_df.columns:
    if hunting_df.loc[first, column_name] == hunting_df.loc[second, column_name]:
        columns_list.append(column_name)
columns_list

In [None]:

match_series = hunting_df.percent_fit.isnull()
print(hunting_df[match_series].shape)
req_id = hunting_df.loc[481, 'Job Requisition ID']
print_job_description(req_id)

In [None]:

print(['{}'.format(fn) for fn in hunting_df.columns if 'req' in fn.lower()])

In [None]:

key_regex = re.compile(r'([^0-9A-Za-z\+ \/)(:,]+)-')
for old_key in basic_quals_dict.keys():
    match_obj = key_regex.search(old_key)
    if match_obj:
        print('"{}": {}'.format(match_obj.group(1), old_key))
        #new_key = re.sub('^[?â-]+', '', old_key)
        #print(new_key)
        #basic_quals_dict[new_key] = basic_quals_dict.pop(old_key)
        break

In [None]:

key_regex = re.compile(r'\s+$')
old_key_list = basic_quals_dict.copy().keys()
for old_key in old_key_list:
    match_obj = key_regex.search(old_key)
    if match_obj:
        #print('"{}": {}'.format(match_obj.group(1), old_key))
        new_key = re.sub(r'\s+$', '', old_key)
        #print(new_key)
        basic_quals_dict[new_key] = basic_quals_dict.pop(old_key)
        #print(old_key)
        #break


---
# Utilities

In [None]:

# Add new ORR to the hunting dataframe
file_name = 'BAH1002 - Open Requisitions Report (ORR) 2020-02-14 09_03 EST.csv'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
jd_cn = 'Job Description'
reqid_cn = 'Job Requisition ID'
dupe_columns_list = ['Hiring Manager', 'Management Level', 'IMT', 'Job Requisition', 'Job Requisition Type', 'Cluster', 'Time Type',
                     'Job Posting Title', 'Recruiting Start Date', 'Account Group', reqid_cn, 'Job Type',
                     'Supervisory Organization', 'Clearance Agency', 'Primary Location State/Province', 'Furthest Stage',
                     'Resource Manager', 'Primary Location', jd_cn, 'Group', 'Job Profile', 'Job Family Group', 'FSO',
                     'Job Family', 'Job Requisition Status', 'Business Title', 'Job Posting', 'Primary Location Country',
                     'Required Clearance', 'Primary Recruiter']
columns_list = ['Job Posting', 'Job Requisition', reqid_cn, 'Job Requisition Status', 'Furthest Stage',
                'Supervisory Organization', 'Group', 'Account Group', 'IMT', 'Cluster', 'FSO', 'Primary Recruiter',
                'Resource Manager', 'Hiring Manager', 'Job Posting Title', 'Job Profile', 'Job Requisition Type',
                'Management Level', 'Primary Location', 'Primary Location State/Province', 'Primary Location Country',
                'Required Clearance', 'Clearance Agency', 'Time Type', 'Recruiting Start Date', 'Job Type', 'Job Family',
                'Business Title', 'Job Family Group', jd_cn]
hunting_dir = r'D:\Documents\Administrivia\Job Hunting\csv'
def add_new_orr(file_name):
    file_path = os.path.join(hunting_dir, file_name)
    if os.path.isfile(file_path):
        !"{text_editor_path}" "{os.path.abspath(file_path)}"
        df = pd.read_csv(file_path, header=0, skiprows=0, encoding='iso8859-1')
        df.columns = columns_list
        req_id_list = hunting_df[reqid_cn].unique().tolist()
        match_series = (df[reqid_cn].isin(req_id_list))
        hunting_df = pd.concat([hunting_df, df[~match_series]]).fillna({'is_opportunity_application_emailed': False})
        hunting_df[jd_cn] = hunting_df[jd_cn].map(lambda x: re.sub(u'\\xa0', u' ', x))
        hunting_df[jd_cn] = hunting_df[jd_cn].map(lambda x: re.sub(r'', "'", x))
        hunting_df[jd_cn] = hunting_df[jd_cn].map(lambda x: re.sub(r'', '—', x))
        hunting_df = hunting_df.drop_duplicates(subset=dupe_columns_list, ignore_index=True)
        hunting_df.reset_index(drop=True, inplace=True)
        s.store_objects(hunting_df=hunting_df)

In [None]:

basic_quals_dict = s.load_object('basic_quals_dict')
def get_basic_quals(row_index):
    match_series = (hunting_df.index == row_index)
    for row_index, row_series in hunting_df[match_series].iterrows():
        percent_fit = get_percent_fit(row_series)
        job_description = row_series['Job Description']
        quals_list = get_quals_list(job_description)
        quals_str = get_quals_str(quals_list, basic_quals_dict)
        if (quals_str != '') and (percent_fit == 0):
            print('Basic Qualifications:{}'.format(quals_str))
            print_loc_computation(row_index, quals_list, basic_quals_dict)
    
    return quals_list


---
# Study of the Safi recommendations

In [None]:

import random

match_series = (hunting_df['Safi Recommendation'] == 1)
[c[10:100].strip() for c in random.choices(population=hunting_df[match_series]['Job Description'].unique(), k=10)]

In [None]:

s.store_objects(hunting_df=hunting_df)
match_series = (hunting_df['Safi Recommendation'] == 1)
hunting_df[match_series]['Primary Location State/Province'].unique().tolist()

In [None]:

hunting_df[match_series]['Job Requisition'].unique()[:10].tolist()

In [None]:

hunting_df[match_series]['Cluster'].unique()[:10].tolist()

In [None]:

hunting_df[match_series]['Job Family'].unique()[:10].tolist()

In [None]:

hunting_df[match_series]['Account Group'].unique()[:10].tolist()

In [None]:

hunting_df[match_series]['Resource Manager'].unique()[:10].tolist()

In [None]:

hunting_df[match_series]['Job Requisition Type'].unique()[:10].tolist()

In [None]:

hunting_df[match_series]['Job Posting'].unique()[:10].tolist()

In [None]:

item_list = population=hunting_df[match_series]['Hiring Manager'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['IMT'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Required Clearance'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Job Profile'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Management Level'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Clearance Agency'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Group'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Job Profile'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

[cn for cn in columns_list if 'loca' in cn.lower()]

In [None]:

item_list = population=hunting_df[match_series]['Primary Location'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])

In [None]:

item_list = population=hunting_df[match_series]['Primary Recruiter'].unique().tolist()
if len(item_list) > 10:
    print(random.choices(item_list, k=10))
else:
    print(item_list[:10])


---
# Initial dataframe creation (don't run again)

In [None]:

hunting_dir = r'D:\Documents\Administrivia\Job Hunting'
columns_list = []
for root, dirs, files in os.walk(hunting_dir):
    #path = root.split(os.sep)
    #print((len(path)-1) * '---', os.path.basename(root))
    for file in files:
        #print(len(path) * '---', file)
        if file.endswith('.csv'):
            print(file)
            file_name = os.path.join(hunting_dir, file)
            if os.path.isfile(file_name):
                df = pd.read_csv(file_name, encoding='iso8859-1')
                columns_list = list(set(columns_list) | set(df.columns.tolist()))

In [None]:

hunting_df = pd.DataFrame([], columns=columns_list)

for root, dirs, files in os.walk(hunting_dir):
    for file in files:
        if file.endswith('.csv'):
            file_name = os.path.join(hunting_dir, file)
            if os.path.isfile(file_name):
                df = pd.read_csv(file_name, encoding='iso8859-1')
                hunting_df = pd.concat([hunting_df, df])


---

In [None]:

command_str = '{sys.executable} -m pip install pyOutlook'.format(sys=sys)
print(command_str)
!{command_str}