In [None]:
import pandas as pd
from json2html import *
from hs_gimme.db_facade.db_facade_factory import get_mongo_client_db
from IPython.core.display import display, HTML
from hs_gimme.application_status_history_classifier.machine_learning_status_classifier import get_machine_learning_status_classifier
from hs_gimme.account_settings_manager import get_account_settings
from collections import Counter
from hs_gimme.application_status_history_classifier.new_cls import NewStatusCls

# Create data

In [None]:
# Load the fetch-measurements export and report its row count.
# NOTE(review): absolute, user-specific path; this cell will not run on another machine as-is.
orig_df = pd.read_csv('/Users/dima/Downloads/fm_for_mer/fetch_measurements.csv')
len(orig_df)

In [None]:
# Alphabetical list of the columns in orig_df.
sorted(orig_df.columns)

In [None]:
# Load fetch_mer4.csv, keeping only req_id, current_talent_id and the good/bad-fit column.
# NOTE(review): absolute, user-specific path.
new_df = pd.read_csv('/Users/dima/Downloads/fetch_mer4.csv')[['req_id', 'current_talent_id', 'Good fit / Bad fit - 0/1']]
new_df.head()

In [None]:
# Drop rows with no value in the good/bad-fit column, then report how many remain.
new_df = new_df.dropna(subset=['Good fit / Bad fit - 0/1'])
len(new_df)

In [None]:
# Join the labelled rows to orig_df on (req_id, current_talent_id); merge's
# default is an inner join, so unmatched rows on either side are dropped.
df = new_df.merge(orig_df, left_on=['req_id', 'current_talent_id'], right_on=['req_id', 'current_talent_id'])
# Keep only rows whose label is exactly the string '0' or '1'.
# .copy() detaches the filtered frame so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
df = df[df['Good fit / Bad fit - 0/1'].isin({'0', '1'})].copy()
df['is_good'] = df['Good fit / Bad fit - 0/1'].astype(int)
len(df)

In [None]:
# Row counts per group_name in df.
df['group_name'].value_counts()

In [None]:
# Load the second measurements export.
# NOTE(review): absolute, user-specific path.
orig_df2 = pd.read_csv('/Users/dima/Downloads/for_mer/fetch_2/indian_moscow_junior_senior.csv')

In [None]:
# Load mer5.csv (all columns) and preview its first rows.
# NOTE(review): absolute, user-specific path.
new_df2 = pd.read_csv('/Users/dima/Downloads/mer5.csv')
new_df2.head()

In [None]:
# Drop rows with no value in the good/bad-fit column, then report how many remain.
new_df2 = new_df2.dropna(subset=['Good fit/ Bad fit (0/1)'])
len(new_df2)

In [None]:
# Join the second labelled set to orig_df2; the key columns are named differently
# on each side ('req id'/'talent ID' vs 'req_id'/'current_talent_id').
# merge's default is an inner join, so unmatched rows are dropped.
df2 = new_df2.merge(orig_df2, left_on=['req id', 'talent ID'], right_on=['req_id', 'current_talent_id'])
# Keep only rows whose label is exactly the string '0' or '1'.
# .copy() detaches the filtered frame so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
df2 = df2[df2['Good fit/ Bad fit (0/1)'].isin({'0', '1'})].copy()
df2['is_good'] = df2['Good fit/ Bad fit (0/1)'].astype(int)
len(df2)

In [None]:
# Row counts per group_name in df2.
df2['group_name'].value_counts()

In [None]:
# Stack both labelled sets into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it the row labels carried over from df and df2
# can collide.
# NOTE: df is rebound here, so re-running this cell appends df2 a second time.
df = pd.concat([df, df2], ignore_index=True)

In [None]:
# Store the first three characters of req_id as 'account'.
df['account'] = df['req_id'].apply(lambda x: x[:3])

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
# Preview the first rows of the combined frame.
df.head()

In [None]:
# Row counts per account_id.
df['account_id'].value_counts()

In [None]:
# Build per-account lookups keyed by account_id. For every distinct account_id in
# df this stores: the get_mongo_client_db result for 'production_qa' (mongos) and
# for 'production' (prod_mongos), the account settings, the machine-learning
# status classifier (status_cls) and a NewStatusCls instance (new_cls).
mongos = {}
account_settings = {}
status_cls = {}
new_cls = {}
prod_mongos = {}
for account_id in df['account_id'].unique():
    # Printed so progress is visible while the per-account objects are created.
    print(account_id)
    mongos[account_id] = get_mongo_client_db('production_qa', account_id)
    prod_mongos[account_id] = get_mongo_client_db('production', account_id)
    account_settings[account_id] =  get_account_settings(account_id)
    status_cls[account_id] = get_machine_learning_status_classifier('production', account_id)
    new_cls[account_id] = NewStatusCls(account_id, 'production')

In [None]:
def get_app(r):
    """Look up the application document for one dataframe row.

    The row's ``account_id`` selects a handle from each of the module-level
    ``mongos`` and ``prod_mongos`` dicts. The ``mongos`` handle is queried first;
    if it returns a falsy result, the same query is run against the
    ``prod_mongos`` handle and that result is returned instead.

    :param r: row-like mapping providing ``req_id``, ``current_talent_id`` and
        ``account_id``.
    :return: the result of ``application.find_one`` from the first handle that
        yields a truthy value, otherwise the second handle's result.
    """
    query = {'req_id': r['req_id'], 'current_talent_id': r['current_talent_id']}
    account_key = r['account_id']
    primary_db = mongos[account_key]
    fallback_db = prod_mongos[account_key]

    found = primary_db.application.find_one(query)
    if not found:
        # Nothing usable in the first database: fall back to the second one.
        found = fallback_db.application.find_one(query)
    return found

In [None]:
# Attach the application document to every row (one get_app lookup per row).
df['app'] = df.apply(get_app, axis=1)

In [None]:
# Max status per row as reported by the account's status_cls classifier, then its distribution.
# NOTE(review): this runs before rows with a missing 'app' are dropped (the dropna
# on 'app' further down), so get_max_status may be handed a missing app. Confirm
# it tolerates that, or move the dropna above this cell.
df['max_status'] = df.apply(lambda r: status_cls[r['account_id']].get_max_status(r['app']), axis=1)
df['max_status'].value_counts()

In [None]:
# Count and fraction of rows whose 'app' is missing.
df['app'].isna().sum(), df['app'].isna().mean()

In [None]:
# Keep only rows that have an application.
df = df.dropna(subset=['app'])

In [None]:
# Cache the enriched frame; the Analysis section reloads it from this file.
df.to_pickle('tagging_analysis_df.pkl')

# Analysis

In [None]:
# Reload the frame cached by the data-creation section.
# NOTE: unpickling can execute arbitrary code; only load a pickle you produced yourself.
df = pd.read_pickle('tagging_analysis_df.pkl')

In [None]:
# Rebuild the per-account lookups keyed by account_id (same statements as the
# setup cell in the data-creation section; presumably repeated so the analysis
# can start from the pickle without re-running that section).
mongos = {}
account_settings = {}
status_cls = {}
new_cls = {}
prod_mongos = {}
for account_id in df['account_id'].unique():
    # Printed so progress is visible while the per-account objects are created.
    print(account_id)
    mongos[account_id] = get_mongo_client_db('production_qa', account_id)
    prod_mongos[account_id] = get_mongo_client_db('production', account_id)
    account_settings[account_id] =  get_account_settings(account_id)
    status_cls[account_id] = get_machine_learning_status_classifier('production', account_id)
    new_cls[account_id] = NewStatusCls(account_id, 'production')

In [None]:
def get_phases(row):
    """Return the row's phases as a '->'-separated string.

    The classifier for the row's ``account_id`` (from the module-level
    ``status_cls`` dict) is asked for ``get_phases_reach_dates`` of the row's
    ``app``; the keys of that mapping are joined in ascending order of their
    values.

    :param row: row-like mapping providing ``account_id`` and ``app``.
    :return: phase names joined with '->'.
    """
    account_id = row['account_id']
    app = row['app']
    reach_dates = status_cls[account_id].get_phases_reach_dates(app)
    by_value = sorted(reach_dates.items(), key=lambda pair: pair[1])
    return '->'.join(phase for phase, _ in by_value)

In [None]:
def get_statuses(app):
    """Return the application's status history as a '->'-separated string.

    Each entry of ``app['ats_application']['status_info']['status_history']``
    contributes the values of its ``codes`` mapping joined with '_'.

    :param app: application document (nested mapping).
    :return: one '_'-joined code string per history entry, joined with '->'.
    """
    history = app['ats_application']['status_info']['status_history']
    return '->'.join('_'.join(entry['codes'].values()) for entry in history)

def get_statuses_names(app):
    """Return the status history as a '->'-separated string of names.

    For each entry of ``app['ats_application']['status_info']['status_history']``
    the ``status_name`` is used, falling back to ``step_name`` when
    ``status_name`` is missing or empty.

    :param app: application document (nested mapping).
    :return: the names joined with '->'.
    :raises Exception: any error hit while building the string is re-raised
        unchanged after the raw history has been printed for debugging.
    """
    statuses = app['ats_application']['status_info']['status_history']
    try:
        status_names = [s.get('status_name') or s.get('step_name') for s in statuses]
        # Joined inside the try so that an entry with neither name (None) also
        # gets the offending history printed before the TypeError propagates.
        return '->'.join(status_names)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit pass
        # straight through without dumping the history.
        print(statuses)
        raise

In [None]:
# Phase sequence per row, as built by get_phases.
df['phases'] = df.apply(get_phases, axis=1)

In [None]:
# Status-name sequence per application, as built by get_statuses_names.
df['statuses_names'] = df['app'].apply(get_statuses_names)

In [None]:
# Status-code sequence per application, as built by get_statuses.
df['statuses'] = df['app'].apply(get_statuses)

In [None]:
# Max status per row as reported by the account's NewStatusCls instance, then its distribution.
df['new_max_status'] = df.apply(lambda r: new_cls[r['account_id']].get_max_status(r['app']), axis=1)
df['new_max_status'].value_counts()

In [None]:
# Per-(max_status, is_good) row counts divided by the per-max_status row counts.
df.groupby(['max_status', 'is_good']).size() / df.groupby('max_status').size()

In [None]:
# Per-(new_max_status, is_good) row counts divided by the per-new_max_status row counts.
df.groupby(['new_max_status', 'is_good']).size() / df.groupby('new_max_status').size()

In [None]:
# Phase sequences of rows with new_max_status == -1 and is_good == 0.
df[(df['new_max_status'] == -1) & (df['is_good'] == 0)]['phases'].value_counts()

In [None]:
# Phase sequences of rows with new_max_status == 4 and is_good == 0.
df[(df['new_max_status'] == 4) & (df['is_good'] == 0)]['phases'].value_counts()

In [None]:
def get_new_group(new_max_status):
    """Map a new-classifier max status to a coarse group label.

    :param new_max_status: numeric status value.
    :return: 'Negative Rejected' for 0, 'Positive Offer+ With Experience' for
        values of 3 and above, and 'Other' for everything else.
    """
    if new_max_status == 0:
        group = 'Negative Rejected'
    elif new_max_status >= 3:
        group = 'Positive Offer+ With Experience'
    else:
        group = 'Other'
    return group

In [None]:
# Coarse group label per row, derived from new_max_status via get_new_group.
df['new_group_name'] = df['new_max_status'].apply(get_new_group)

In [None]:
# Row count and mean of is_good per new_group_name.
df.groupby(['new_group_name'])['is_good'].agg(['count', 'mean'])

In [None]:
# Row count and mean of is_good per group_name, for comparison with the new grouping.
df.groupby(['group_name',])['is_good'].agg(['count', 'mean'])

# Investigation

In [None]:
# Row count and mean of is_good for every (max_status, new_max_status) pair.
df.groupby(['max_status', 'new_max_status'])['is_good'].agg(['count', 'mean'])

In [None]:
# Status-name sequences of rows with new_max_status == 0 and is_good == 1.
df[(df['new_max_status'] == 0) & (df['is_good'] == 1)]['statuses_names'].value_counts()

In [None]:
# Rows with max_status == 4, new_max_status == -1 and is_good == 1 whose phases
# are exactly 'review->interview->offer->hire'; shows the ids and the phases.
df[(df['max_status'] == 4) &  (df['new_max_status'] == -1) & (df['is_good'] == 1) & (df['phases'] == 'review->interview->offer->hire')][['req_id', 'current_talent_id', 'phases']]

In [None]:
# Mean of is_good for rows whose status-name sequence is exactly this two-step string.
df[df['statuses_names'] == 'JOB_APPLICATION_DEFAULT_DEFINITION_STEP_B__ACTION->JOB_APPLICATION_DEFAULT_DEFINITION_STEP_P_REVIEW_CONCLUSION:_DOES_NOT_MEET_MINIMUM_REQUIREMENTS_FOR_THE_POSITION_(NOTIFICATION_WILL_BE_SENT)']['is_good'].mean()

In [None]:
# Ids and phases of the rows whose status-name sequence is exactly this two-step string.
df[df['statuses_names'] == 'JOB_APPLICATION_DEFAULT_DEFINITION_STEP_B__ACTION->JOB_APPLICATION_DEFAULT_DEFINITION_STEP_P_REVIEW_CONCLUSION:_DOES_NOT_MEET_MINIMUM_REQUIREMENTS_FOR_THE_POSITION_(NOTIFICATION_WILL_BE_SENT)'][['req_id', 'current_talent_id', 'phases']]

In [None]:
# For rows with new_max_status == 0: row count and mean of is_good per
# status-name sequence, most frequent sequences first.
df[(df['new_max_status'] == 0)].groupby('statuses_names')['is_good'].agg(['count', 'mean']).sort_values('count', ascending=False)

In [None]:
# List the columns currently in df.
df.columns

In [None]:
# Distribution of the req_seniority_level column. The cell previously held only
# the bare column name, which raises NameError when the notebook is run top to bottom.
df['req_seniority_level'].value_counts()

In [None]:
# Row count and mean of is_good per (req_seniority_level, group_name).
df.groupby(['req_seniority_level', 'group_name'])['is_good'].agg(['count', 'mean'])

In [None]:
# Row count and mean of is_good per (req_seniority_level, new_group_name).
df.groupby(['req_seniority_level', 'new_group_name'])['is_good'].agg(['count', 'mean'])