In [5]:
from datetime import datetime

import pandas as pd
from tqdm import tqdm

from analytics_configs.mng.sources import ANALYTICS_BUCKET_TO_PHASE_ID_MAPPING
from analytics_utils.db_utils import get_db_client, SupportedDBs
from hs_gimme.account_settings_manager import get_account_settings
from hs_gimme.application_status_history_classifier.machine_learning_status_classifier import \
    get_machine_learning_status_classifier
from hs_tree_blenders_creation.batch_runners.data_sets import get_account_train_set, get_account_threshold_set, get_account_test_set

# NOTE(review): `gmcdb`, called by the collection cell, is not imported anywhere in the
# visible notebook — TODO add its import here so the notebook runs on a fresh kernel.

In [6]:
# Accounts iterated by the collection loop.
ACCOUNTS = ['oxford', 'seattle', 'porto', 'antalya', 'tampa', 'moscow']
# Alternative account list, currently disabled.
#ACCOUNTS = ['seattle', 'denver', 'antwerp',]
# Environment name: passed to the Mongo / classifier / BigQuery factories and
# used as the suffix of the BigQuery dataset name (`<account>_<ENV>`).
ENV = 'production'
# Upper bound on the number of applications fetched per account.
SAMPLE_SIZE = 500

In [7]:
def get_statuses_names(app):
    """Return the application's status history as one ``'->'``-joined string.

    Each history entry contributes its ``status_name``; when that is missing or
    falsy, its ``step_name`` is used instead. Entries where both are missing or
    falsy (including ``None``) contribute an empty string, so the join never
    fails on a ``None`` name.

    Args:
        app: Application document; must contain
            ``app['ats_application']['status_info']['status_history']`` as an
            iterable of dict-like entries.

    Returns:
        str: The names in history order, separated by ``'->'`` (``''`` for an
        empty history).

    Raises:
        KeyError: If the nested ``status_history`` path is absent.
        Exception: Any error raised while reading the entries is re-raised
            after the raw history has been printed for debugging.
    """
    statuses = app['ats_application']['status_info']['status_history']
    try:
        # The trailing `or ''` guards against entries whose names are None.
        status_names = [
            s.get('status_name') or s.get('step_name') or ''
            for s in statuses
        ]
    except Exception:
        # Show the malformed history before propagating the original error.
        print(statuses)
        raise
    return '->'.join(status_names)

In [8]:
# Build `data`: one summary row per sampled application, for every account in ACCOUNTS.
data = []
for account in ACCOUNTS:
    print(account)

    # Per-account handles and settings.
    # NOTE(review): `gmcdb` is not imported or defined in the visible cells — confirm its origin.
    mongo = gmcdb(ENV, account)
    account_settings = get_account_settings(account)
    integration = account_settings.get('ats_integration_type')
    candidate_phases = account_settings.get('candidate_phases') or []
    phases_order_mapping = dict((item['id'], order) for order, item in enumerate(candidate_phases))
    status_cls = get_machine_learning_status_classifier(ENV, account)

    # Disabled alternative for choosing requisitions:
    #req_ids = mongo.req.distinct('_id', {'is_enabled': True, 'is_gradable': True, 'job_create_date': {'$gt': datetime(2023, 1, 1)}})
    req_ids = get_account_test_set(account)
    print(len(req_ids), 'reqs in test set')

    # Applications of the selected reqs with non-empty experience and status history.
    app_filter = {
        'samurai_json.experience': {'$gt': []},
        'req_id': {'$in': req_ids},
        'ats_application.status_info.status_history': {'$gt': []},
    }
    apps = list(mongo.application.find(app_filter).limit(SAMPLE_SIZE))
    if not apps:
        continue

    # Fetch the status-history rows of the sampled applications from BigQuery.
    dataset = '_'.join((account, ENV))
    db_client = get_db_client(ENV, account, SupportedDBs.BIGQUERY)
    ids = ','.join('"' + sampled_app['_id'] + '"' for sampled_app in apps)
    sql = (
        f"select * from {dataset}.dwh_hs_fact_application_status_history \n"
        f"    where application_id IN ({ids})\n"
        "    "
    )
    job_result = db_client.execute(sql)
    job_df = job_result['query_reults'].to_dataframe()

    for app in tqdm(apps):
        max_status = status_cls.get_max_status(app)

        # Phase names ordered by the value the classifier reports for each phase.
        phases = status_cls.get_phases_reach_dates(app)
        ordered_phases = sorted(phases.items(), key=lambda item: item[1])
        phases_path = '->'.join(name for name, _reached in ordered_phases)

        # This application's warehouse rows in `status_ts` order, nulls shown as "NONE".
        is_this_app = job_df['application_id'] == app['_id']
        app_df = job_df[is_this_app].sort_values('status_ts').fillna("NONE")
        analytics_path = '->'.join(app_df['analytics_bucket'])
        max_analytics_bucket = ','.join(app_df['max_analytics_bucket_reached'].unique())
        category_path = '->'.join(app_df['status_category'])
        status_path = '->'.join(app_df['status_code'])

        row = {
            'account': account,
            'app_id': app['_id'],
            'max_status': max_status,
            'app': app,
            'phases_path': phases_path,
            'statuses_names': get_statuses_names(app),
            'integration': integration,
            'analytics_path': analytics_path,
            'max_analytics_bucket': max_analytics_bucket,
            'category_path': category_path,
            'status_path': status_path,
        }
        data.append(row)

len(data)

oxford
9749 reqs in test set


100%|██████████| 500/500 [00:00<00:00, 1076.84it/s]


seattle
[2m2023-10-18T06:57:48.566863Z[0m [[32m[1minfo     [0m] [1mRunning DVC command ['dvc', 'pull', PosixPath('/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/datasets/spotlight_seattle/data')][0m [36menvironment[0m=[35mlocal[0m [36mfile_path[0m=[35m/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/dvc_manager/dvc_facade.py[0m [36mfunction_name[0m=[35m_run[0m [36mhostname[0m=[35mDima-Shulga-MacBook-Pro[0m [36mline_number[0m=[35m31[0m [36mmodule[0m=[35mhs_brain_mlops.dataset_management.dvc_manager.dvc_facade[0m [36mpid[0m=[35m62832[0m
2555 reqs in test set


100%|██████████| 500/500 [00:00<00:00, 1060.48it/s]


porto
[2m2023-10-18T06:58:04.056633Z[0m [[32m[1minfo     [0m] [1mRunning DVC command ['dvc', 'pull', PosixPath('/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/datasets/spotlight_porto/data')][0m [36menvironment[0m=[35mlocal[0m [36mfile_path[0m=[35m/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/dvc_manager/dvc_facade.py[0m [36mfunction_name[0m=[35m_run[0m [36mhostname[0m=[35mDima-Shulga-MacBook-Pro[0m [36mline_number[0m=[35m31[0m [36mmodule[0m=[35mhs_brain_mlops.dataset_management.dvc_manager.dvc_facade[0m [36mpid[0m=[35m62832[0m
5369 reqs in test set


100%|██████████| 500/500 [00:00<00:00, 1031.66it/s]


antalya
[2m2023-10-18T06:58:16.053654Z[0m [[32m[1minfo     [0m] [1mRunning DVC command ['dvc', 'pull', PosixPath('/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/datasets/spotlight_antalya/data')][0m [36menvironment[0m=[35mlocal[0m [36mfile_path[0m=[35m/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/dvc_manager/dvc_facade.py[0m [36mfunction_name[0m=[35m_run[0m [36mhostname[0m=[35mDima-Shulga-MacBook-Pro[0m [36mline_number[0m=[35m31[0m [36mmodule[0m=[35mhs_brain_mlops.dataset_management.dvc_manager.dvc_facade[0m [36mpid[0m=[35m62832[0m
22681 reqs in test set


100%|██████████| 500/500 [00:00<00:00, 1102.31it/s]


tampa
[2m2023-10-18T06:58:31.241283Z[0m [[32m[1minfo     [0m] [1mRunning DVC command ['dvc', 'pull', PosixPath('/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/datasets/spotlight_tampa/data')][0m [36menvironment[0m=[35mlocal[0m [36mfile_path[0m=[35m/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/dvc_manager/dvc_facade.py[0m [36mfunction_name[0m=[35m_run[0m [36mhostname[0m=[35mDima-Shulga-MacBook-Pro[0m [36mline_number[0m=[35m31[0m [36mmodule[0m=[35mhs_brain_mlops.dataset_management.dvc_manager.dvc_facade[0m [36mpid[0m=[35m62832[0m
9141 reqs in test set


100%|██████████| 500/500 [00:00<00:00, 1136.95it/s]


moscow
[2m2023-10-18T06:58:44.826414Z[0m [[32m[1minfo     [0m] [1mRunning DVC command ['dvc', 'pull', PosixPath('/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/datasets/spotlight_moscow/data')][0m [36menvironment[0m=[35mlocal[0m [36mfile_path[0m=[35m/Users/dima/hiredscore/brain_mlops/hs_brain_mlops/dataset_management/dvc_manager/dvc_facade.py[0m [36mfunction_name[0m=[35m_run[0m [36mhostname[0m=[35mDima-Shulga-MacBook-Pro[0m [36mline_number[0m=[35m31[0m [36mmodule[0m=[35mhs_brain_mlops.dataset_management.dvc_manager.dvc_facade[0m [36mpid[0m=[35m62832[0m
14428 reqs in test set


100%|██████████| 500/500 [00:00<00:00, 1041.70it/s]


3000

In [None]:
import pickle

In [None]:
# Persist the collected rows. The `with` block closes (and flushes) the file
# handle even if pickling fails part-way; the bare `open(...)` never closed it.
# The file name is kept exactly as it was so existing loaders still find it.
with open('statuses_acounts_data.pkl', 'wb') as pickle_file:
    pickle.dump(data, pickle_file)

In [None]:
# Bare string expression; the cell only echoes it (presumably a visual "finished" marker).
'Done'

In [None]:
# Tabular view of the collected rows: one row per application, one column per dict key.
df = pd.DataFrame(data=data)

In [9]:
# How often each distinct status-code path occurs across the sampled applications.
status_path_counts = df['status_path'].value_counts()
status_path_counts

NameError: name 'df' is not defined

In [None]:
# Number of sampled applications per classifier phase path.
phases_path_sizes = df.groupby(['phases_path']).size()
phases_path_sizes

In [None]:
# Row counts per (account, new_max_status).
# NOTE(review): `new_max_status` is not one of the keys written into `data` by the
# collection loop in this notebook, so this raises KeyError unless the column is
# added by code not visible here — TODO confirm / restore.
df.groupby(['account', 'new_max_status']).size()

In [None]:
# Share of each max_status within every account, drawn as grouped bars.
counts_by_account_status = df.groupby(['account', 'max_status']).size()
counts_by_account = df.groupby(['account']).size()
status_share = counts_by_account_status / counts_by_account
status_share.unstack(1).plot(kind='bar', figsize=(10, 5))

In [None]:
# Share of each new_max_status within every account, drawn as grouped bars.
# NOTE(review): `new_max_status` is not produced by the collection loop visible in
# this notebook; this raises KeyError unless the column is added elsewhere — TODO confirm.
(df.groupby(['account', 'new_max_status']).size() / df.groupby(['account']).size()).unstack(1).plot(kind='bar', figsize=(10, 5))

In [None]:
# Restrict the frame to the rows of the 'oxford' account.
is_oxford = df['account'] == 'oxford'
ac = df[is_oxford]

In [None]:
# (rows, columns) of the single-account subset `ac`.
ac.shape

In [None]:
# Frequency of each new_max_status value within `ac`.
# NOTE(review): `new_max_status` is not produced by the collection loop visible in
# this notebook; this raises KeyError unless the column is added elsewhere — TODO confirm.
ac['new_max_status'].value_counts()

In [None]:
# Frequency of each new_max_status value within `ac`; the same expression also
# appears in the cell directly before this one.
# NOTE(review): `new_max_status` is not produced by the collection loop visible in
# this notebook — TODO confirm where the column comes from.
ac['new_max_status'].value_counts()

In [None]:
# First rows of `ac` that followed this exact phase path and have new_max_status == -1.
# NOTE(review): `new_max_status` is not produced by the collection loop visible in
# this notebook; this raises KeyError unless the column is added elsewhere — TODO confirm.
ac[(ac['phases_path']  == 'hr_review->obsolete->department_interview->rejected') & (ac['new_max_status'] == -1)].head()

In [None]:
# NOTE(review): `account_settings` is whatever the collection loop assigned last,
# i.e. the settings of the final entry in ACCOUNTS — not necessarily the account
# selected into `ac`. Verify this is the intended account.
account_settings.get('ats_integration_type')

In [None]:
# Row counts per integration type, split by new_max_status, drawn as grouped bars.
# NOTE(review): `new_max_status` is not produced by the collection loop visible in
# this notebook; this raises KeyError unless the column is added elsewhere — TODO confirm.
df.groupby(['integration', 'new_max_status']).size().unstack(1).plot(kind='bar', figsize=(10, 5))

In [None]:
# `%pip` installs into the environment of the running kernel; `!pip` runs whichever
# pip is first on the shell PATH, which may belong to a different environment.
%pip install wordcloud

In [None]:
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Word cloud of the status names of 'workday' applications whose
# `max_analytics_bucket` value is exactly 'Screening'.
# `text` has to exist before `words` is derived from it, so both live in one cell
# and the notebook runs top-to-bottom on a fresh kernel.
workday_screening = df[(df['integration'] == 'workday') & (df['max_analytics_bucket'] == 'Screening')]
text = ' '.join(i.replace('->', ' ').replace('_', ' ').lower() for i in workday_screening['statuses_names'])
words = text.split()

# Extra words excluded from the cloud, on top of wordcloud's built-in STOPWORDS.
stopwords = ['action', 'default', 'definition', 'application', 'step', 'job', 'review', 'to','be', 'b', 'new', 'candidate',
            'not', 'screen', 'conclusion', 'applicant', 'incomplete', 'regrets', 'sent', 'portal', 'slate', 'resume',
            'external', 'check', 'completed', 'pending', 'position']
wordcloud = WordCloud(stopwords=set(stopwords) | STOPWORDS,
                      background_color="white",
                      min_word_length=3,
                      collocations=False).generate(text)
plt.figure(figsize=(13, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Requisition id = the part of the application id before its first underscore.
# NOTE(review): assumes app ids look like "<req_id>_<...>" and that req ids never
# contain "_" themselves — TODO confirm.
df['req_id'] = df['app_id'].map(lambda app_id: app_id.split('_', 1)[0])

In [None]:
# Per-account distribution of the highest status reached within each requisition.
# Two fixes: a GroupBy object has no `.groupby`, so the first grouping must be
# aggregated before grouping again; and the frame has an 'account' column, not
# 'account_id'.
# NOTE(review): collapsing each requisition to the max of `max_status` is an
# assumption about the intended aggregation (and assumes the statuses are
# orderable) — confirm.
req_max_status = df.groupby(['account', 'req_id'])['max_status'].max().reset_index()
req_max_status.groupby(['account', 'max_status']).size().unstack(1).plot(kind='bar', figsize=(10, 5))

In [None]:
# Ad-hoc look at the raw test set for 'oxford'. The collection loop treats this
# value as the list of req ids and reported 9749 of them for this account, so the
# echoed output is large.
get_account_test_set('oxford')